In [2]:
#Import necessary modules.
import pandas as pd
import numpy as np
from thunderpack import ThunderReader  
from tqdm import tqdm


In [3]:
#Read in ICD positive data for BIDMC.
ICDs = pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_SDH/patientIDs_ICD_plus_SDH_BI.csv")

In [4]:
#Read in the MGB and BIDMC combined feature matrix with CPT Codes
matrix=pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/MGB_BIDMC_CPT_Feature_Matrix_.csv')

In [5]:
#Ensure that the feature matrix added correctly
print(len(matrix))
print(matrix.columns)

3000
Index(['BDSPPatientID', 'ContactDate', 'NoteFileName', 'Site', 'CT', 'MRI',
       'acut sdh_pos', 'brain injuri_pos', 'brain mri_pos', 'burr hole_pos',
       'stabl sdh_pos', 'craniectomi_pos', 'craniotomi_pos', 'ct head_pos',
       'drainag_pos', 'evacu_pos', 'head ct_pos', 'hematoma_pos',
       'chronic sdh_pos', 'herniat_pos', 'intracrani hemorrhag_pos',
       'intraparenchym hemorrhag_pos', 'midlin shift_pos', 'mva_pos',
       'mvc_pos', 'neurosurgeri_pos', 'neurosurg intervent_pos', 'scan_pos',
       'sdh_pos', 'subdur_pos', 'tbi_pos', 'tentorium_pos', 'thick_pos',
       'trauma_pos', 'prior sdh_pos', 'recent sdh_pos', 'resolv sdh_pos',
       'known sdh_pos', 'acut sdh_neg', 'brain injuri_neg', 'brain mri_neg',
       'burr hole_neg', 'stabl sdh_neg', 'craniectomi_neg', 'craniotomi_neg',
       'ct head_neg', 'drainag_neg', 'evacu_neg', 'head ct_neg',
       'hematoma_neg', 'chronic sdh_neg', 'herniat_neg',
       'intracrani hemorrhag_neg', 'intraparenchym hemorrhag

In [6]:
#Ensure that the BIDMC ICD added correctly
print(ICDs.head())
print(len(ICDs))

   BDSPPatientID AdmissionDate  BDSPEncounterID DiagnosisCodeWithDots
0      150077337    2011-02-14        132846724                852.21
1      150077456    2011-12-07        132860169                852.21
2      150077456    2012-01-07        132867265                432.1 
3      150077456    2012-01-25        132871386                432.1 
4      150063299    2013-06-23        132979960                852.21
13492


In [7]:
#Check which positive ICD's have matching BDSPPatientIDs in the feature matrix 
ICDs = ICDs[ICDs['BDSPPatientID'].isin(set(matrix['BDSPPatientID']))]
ICDs['ICD Date'] = pd.to_datetime(ICDs['AdmissionDate']).dt.strftime('%Y-%m-%d')
ICDs = ICDs[['BDSPPatientID', 'ICD Date','DiagnosisCodeWithDots']]
ICDs.head()

Unnamed: 0,BDSPPatientID,ICD Date,DiagnosisCodeWithDots
5,150000139,2013-10-02,432.1
17,150000139,2013-10-02,432.1
18,150000139,2013-11-13,432.1
32,150000139,2013-11-11,432.1
71,150001368,2010-12-04,432.1


In [8]:
#Make sure that there are only 750 unique positive BIDMC patient IDs
print(ICDs['BDSPPatientID'].nunique())

750


In [9]:
#Verify length of ICDs
print(len(ICDs))

3086


In [10]:
# Import MGB ICD positive CSV
ICDs_MGB = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/patientIDs_ICD_plus_SDH_MGB.csv')
ICDs_MGB['BDSPPatientID'] = ICDs_MGB['BDSPPatientID'].astype(int)

# Filter ICDs_MGB based on BDSPPatientID in matrix
ICDs_MGB = ICDs_MGB[ICDs_MGB['BDSPPatientID'].isin(set(matrix['BDSPPatientID']))]

# Convert ShiftedContactDTS to datetime and format it as 'YYYY-MM-DD'
ICDs_MGB['ICD Date'] = pd.to_datetime(ICDs_MGB['ShiftedContactDTS'], format='%Y-%m-%d %H:%M:%S.%f').dt.strftime('%Y-%m-%d')

# Select necessary columns BDSPPatientID and ICD Date
ICDs_MGB = ICDs_MGB[['BDSPPatientID', 'ICD Date', 'ICDCD']].rename(columns={'ICDCD': 'DiagnosisCodeWithDots'})

# Print the head of the final DataFrame to ensure the above code worked
ICDs_MGB.head()




Unnamed: 0,BDSPPatientID,ICD Date,DiagnosisCodeWithDots
4,112012057,2022-09-30,432.1
18,115840304,2021-08-10,432.1
19,116269678,2022-10-16,432.1
21,116863283,2017-04-02,432.1
24,119488118,2017-05-12,432.1


In [11]:
#Making sure the above code worked and that there are 750 unique patient ids
print(ICDs_MGB.head())
print(len(ICDs_MGB))
print(ICDs_MGB['BDSPPatientID'].nunique()) 

    BDSPPatientID    ICD Date DiagnosisCodeWithDots
4       112012057  2022-09-30                 432.1
18      115840304  2021-08-10                 432.1
19      116269678  2022-10-16                 432.1
21      116863283  2017-04-02                 432.1
24      119488118  2017-05-12                 432.1
14614
750


In [12]:
# Concatenating the ICD positive lists together
ICDs_compl=[ICDs,ICDs_MGB]
ICDs_complete=pd.concat(ICDs_compl, ignore_index=True)

In [13]:
# Making sure the above code worked. 
print(ICDs_complete.head())
print(len(ICDs_complete))
print(ICDs_complete['BDSPPatientID'].nunique())

   BDSPPatientID    ICD Date DiagnosisCodeWithDots
0      150000139  2013-10-02                432.1 
1      150000139  2013-10-02                432.1 
2      150000139  2013-11-13                432.1 
3      150000139  2013-11-11                432.1 
4      150001368  2010-12-04                432.1 
17700
1500


In [14]:
# Convert Contact Date to date time format
matrix['ContactDate']= pd.to_datetime(matrix['ContactDate']) 
ICDs_complete['ICD Date'] = pd.to_datetime(ICDs_complete['ICD Date'])

In [21]:
#Filtering ICDs based on a time windox of 5 months from the contact date.
ICDs_complete = ICDs_complete.rename(columns={'ICD Date': 'ContactDate'})
ICDs_complete['ContactDate'] = pd.to_datetime(ICDs_complete.ContactDate)
ICDs_complete['DiagnosisCodeWithDots'] = ICDs_complete.DiagnosisCodeWithDots.astype(str)

icd_codes = ['I62.0', 'S06.5', '432.1', '852.2', '852.3']
icd_codes_regex = [r'^I62\.0', r'^S06\.5', r'^432\.1', r'^(?:852\.2|852\.3)']
icd_codes_col_names = ['ICD_I62.0', 'ICD_S06.5', 'ICD_432.1', 'ICD_852.2or3']
for col in icd_codes_col_names:
    matrix[col] = np.nan


for i in tqdm(range(len(matrix))):
    patientid = matrix.BDSPPatientID.iloc[i]
    note_date = matrix.ContactDate.iloc[i]
    time_window_left = note_date - pd.DateOffset(months=6)
    time_window_right = note_date + pd.DateOffset(months=6)
    df_subset = ICDs_complete[ICDs_complete.BDSPPatientID == patientid]
    df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
    for regex, col in zip(icd_codes_regex, icd_codes_col_names):
        matrix.loc[i, col] = (df_subset['DiagnosisCodeWithDots'].str.contains(regex, case=False, regex=True).any()).astype(int)



  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_sub

  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_sub

In [23]:
#Convert column binary values so they are not 1.0 or 0.0 but 1 and 0. 
cols_to_convert = ['ICD_I62.0', 'ICD_S06.5', 'ICD_432.1', 'ICD_852.2or3']

for col in cols_to_convert:
    matrix[col] = matrix[col].astype(int)

matrix.head()

Unnamed: 0,BDSPPatientID,ContactDate,NoteFileName,Site,CT,MRI,acut sdh_pos,brain injuri_pos,brain mri_pos,burr hole_pos,...,trauma_neg,prior sdh_neg,recent sdh_neg,resolv sdh_neg,known sdh_neg,history_sdh,ICD_I62.0,ICD_S06.5,ICD_432.1,ICD_852.2or3
0,120109726,2019-12-01,Notes_13393227243_2508134861_20191201.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
1,111971091,2021-05-30,Notes_13517098931_6064385669_20210530.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,114651683,2022-03-09,Notes_13621620103_9363901699_20220309.txt,MGB,0,1,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
3,115288640,2021-03-02,Notes_13554067774_6841967671_20210302.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
4,115389340,2019-12-02,Notes_13481394171_4791230696_20191202.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0


In [26]:
#Convert final feature matrix including CPT, ICD, and features to a csv
matrix.to_csv('feature_matrix_notes_CPT_and_ICD_.csv', index=False)
matrix.head()

Unnamed: 0,BDSPPatientID,ContactDate,NoteFileName,Site,CT,MRI,acut sdh_pos,brain injuri_pos,brain mri_pos,burr hole_pos,...,trauma_neg,prior sdh_neg,recent sdh_neg,resolv sdh_neg,known sdh_neg,history_sdh,ICD_I62.0,ICD_S06.5,ICD_432.1,ICD_852.2or3
0,120109726,2019-12-01,Notes_13393227243_2508134861_20191201.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
1,111971091,2021-05-30,Notes_13517098931_6064385669_20210530.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,114651683,2022-03-09,Notes_13621620103_9363901699_20220309.txt,MGB,0,1,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
3,115288640,2021-03-02,Notes_13554067774_6841967671_20210302.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
4,115389340,2019-12-02,Notes_13481394171_4791230696_20191202.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0


In [27]:
#Purpose of this cell is to merge the newly formed matrix to the annotations
#Load in matrix
X = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/feature_matrix_notes_CPT_and_ICD_.csv')

#Load in Annotation csv
y_data_pre = pd.read_csv('/home/gregory178/Desktop/Annotation_Results/BIDMC_+_Minus_MGB_+_-.csv')
y = y_data_pre[['BDSPPatientID', 'annot','hospital']]

#Verify column names in X and y
print("Columns in X:", X.columns)
print("Columns in y:", y.columns)

#Merge X and y based on 'BDSPPatientID'
Xy = pd.merge(X, y, on='BDSPPatientID', how='inner', validate='1:1')
Xy.to_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/Complete_merged_feature_matrix_notes_CPT_and_ICD_.csv')

Columns in X: Index(['BDSPPatientID', 'ContactDate', 'NoteFileName', 'Site', 'CT', 'MRI',
       'acut sdh_pos', 'brain injuri_pos', 'brain mri_pos', 'burr hole_pos',
       'stabl sdh_pos', 'craniectomi_pos', 'craniotomi_pos', 'ct head_pos',
       'drainag_pos', 'evacu_pos', 'head ct_pos', 'hematoma_pos',
       'chronic sdh_pos', 'herniat_pos', 'intracrani hemorrhag_pos',
       'intraparenchym hemorrhag_pos', 'midlin shift_pos', 'mva_pos',
       'mvc_pos', 'neurosurgeri_pos', 'neurosurg intervent_pos', 'scan_pos',
       'sdh_pos', 'subdur_pos', 'tbi_pos', 'tentorium_pos', 'thick_pos',
       'trauma_pos', 'prior sdh_pos', 'recent sdh_pos', 'resolv sdh_pos',
       'known sdh_pos', 'acut sdh_neg', 'brain injuri_neg', 'brain mri_neg',
       'burr hole_neg', 'stabl sdh_neg', 'craniectomi_neg', 'craniotomi_neg',
       'ct head_neg', 'drainag_neg', 'evacu_neg', 'head ct_neg',
       'hematoma_neg', 'chronic sdh_neg', 'herniat_neg',
       'intracrani hemorrhag_neg', 'intraparenchym 

In [28]:
#Now we are going to split the data into training and testing data randomly
df = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/Complete_merged_feature_matrix_notes_CPT_and_ICD_.csv')

#Randomly Sample 1500 rows
sampled_df = df.sample(n=1500, random_state=13)

#Save randomly sampled data to CSV
sampled_df.to_csv('train_data_.csv', index=False)

#Save remaining data to CSV as testing data
remaining_df = df.drop(sampled_df.index)
remaining_df.to_csv('test_data_.csv', index=False)

#Verify that the two data sets do not share any BDSPPatientIDs
train_ids = set(sampled_df['BDSPPatientID'])
test_ids = set(remaining_df['BDSPPatientID'])

shared_ids = train_ids.intersection(test_ids)

if len(shared_ids) == 0:
    print("Verification passed: No common BDSPPatientID between train_data_.csv and test_data_.csv")
else:
    print("Verification failed: Common BDSPPatientID found between train_data_.csv and test_data_.csv")
    print("Shared IDs:", shared_ids)

Verification passed: No common BDSPPatientID between train_data_.csv and test_data_.csv


In [29]:
#Code to improve the feature matrix. At one point, this was used to see which features to remove. 
#If less than 10 features were present, it was considered for removal.

features_before= sampled_df.loc[:, "CT":"ICD_852.2or3"]

features_before

total_counts = features_before.apply(lambda x: (x == 1).sum() + (x == -1).sum())

# Step 2: Determine if the total count of 1's and -1's is less than 20 for each column
for col, count in total_counts.items():
    if count < 10:
        print(f"For column '{col}', total count ({count}) of 1's and -1's is less than 10. Remove feature.")
    else:
        print(f"For column '{col}', total count ({count}) of 1's and -1's is 10 or more. Keep feature.")

For column 'CT', total count (746) of 1's and -1's is 10 or more. Keep feature.
For column 'MRI', total count (581) of 1's and -1's is 10 or more. Keep feature.
For column 'acut sdh_pos', total count (32) of 1's and -1's is 10 or more. Keep feature.
For column 'brain injuri_pos', total count (79) of 1's and -1's is 10 or more. Keep feature.
For column 'brain mri_pos', total count (75) of 1's and -1's is 10 or more. Keep feature.
For column 'burr hole_pos', total count (36) of 1's and -1's is 10 or more. Keep feature.
For column 'stabl sdh_pos', total count (20) of 1's and -1's is 10 or more. Keep feature.
For column 'craniectomi_pos', total count (37) of 1's and -1's is 10 or more. Keep feature.
For column 'craniotomi_pos', total count (189) of 1's and -1's is 10 or more. Keep feature.
For column 'ct head_pos', total count (319) of 1's and -1's is 10 or more. Keep feature.
For column 'drainag_pos', total count (177) of 1's and -1's is 10 or more. Keep feature.
For column 'evacu_pos', t

In [34]:
#Verifying that the values of pos_sdh and history_sdh are mutually exclusive. 

# 1. Count rows where both sdh_pos and history_sdh are 1
both_ones = df2[(df2['sdh_pos'] == 1) & (df2['history_sdh'] == 1)].shape[0]

# 2. Count rows where sdh_pos is 1 and history_sdh is 0
pos_one_neg_zero = df2[(df2['sdh_pos'] == 1) & (df2['history_sdh'] == 0)].shape[0]

# 3. Count rows where history_sdh is 1 and sdh_pos is 0
neg_one_pos_zero = df2[(df2['sdh_pos'] == 0) & (df2['history_sdh'] == 1)].shape[0]

# 4. Count rows where both sdh_pos and history_sdh are 0
both_zeros = df2[(df2['sdh_pos'] == 0) & (df2['history_sdh'] == 0)].shape[0]

# Print the results
print(f"1. Rows where sdh_pos and history_sdh are both 1: {both_ones}")
print(f"2. Rows where sdh_pos is 1 and history_sdh is 0: {pos_one_neg_zero}")
print(f"3. Rows where history_sdh is 1 and sdh_pos is 0: {neg_one_pos_zero}")
print(f"4. Rows where sdh_pos and history_sdh are both 0: {both_zeros}")


1. Rows where sdh_pos and history_sdh are both 1: 0
2. Rows where sdh_pos is 1 and history_sdh is 0: 442
3. Rows where history_sdh is 1 and sdh_pos is 0: 40
4. Rows where sdh_pos and history_sdh are both 0: 1018
