In [63]:
# Import necessary modules
import pandas as pd
import numpy as np
from thunderpack import ThunderReader  # install thunderpack if not yet: pip install thunderpack
from tqdm import tqdm


In [64]:
# Read in ICD positive data for BIDMC
#Remember to change it back patientIDS
#The 2 in front was something I changed a while back but didn't want to take the time to rerun the code. 
ICDs = pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_dive/2patientIDs_ICD_plus_SDH_BI.csv")

In [65]:
# Read in the MGB and BIDMC combined feature matrix with CPT Codes
matrix=pd.read_csv('/home/gregory178/Desktop/NAX project/FM_Draft_15/MGB_BIDMC_CPT_Feature_Matrix_Draft_15.csv')

In [66]:
#Ensure that the feature matrix added correctly
print(len(matrix))
print(matrix.columns)

3000
Index(['BDSPPatientID', 'ContactDate', 'NoteFileName', 'Site', 'CT', 'MRI',
       'acut sdh_pos', 'brain injuri_pos', 'brain mri_pos', 'burr hole_pos',
       'stabl sdh_pos', 'craniectomi_pos', 'craniotomi_pos', 'ct head_pos',
       'drainag_pos', 'evacu_pos', 'head ct_pos', 'hematoma_pos',
       'chronic sdh_pos', 'herniat_pos', 'intracrani hemorrhag_pos',
       'intraparenchym hemorrhag_pos', 'midlin shift_pos', 'mva_pos',
       'mvc_pos', 'neurosurgeri_pos', 'neurosurg intervent_pos', 'scan_pos',
       'sdh_pos', 'subdur_pos', 'tbi_pos', 'tentorium_pos', 'thick_pos',
       'trauma_pos', 'prior sdh_pos', 'recent sdh_pos', 'resolv sdh_pos',
       'known sdh_pos', 'acut sdh_neg', 'brain injuri_neg', 'brain mri_neg',
       'burr hole_neg', 'stabl sdh_neg', 'craniectomi_neg', 'craniotomi_neg',
       'ct head_neg', 'drainag_neg', 'evacu_neg', 'head ct_neg',
       'hematoma_neg', 'chronic sdh_neg', 'herniat_neg',
       'intracrani hemorrhag_neg', 'intraparenchym hemorrhag

In [67]:
#Ensure that the BIDMC ICD added correctly
print(ICDs.head())
print(len(ICDs))

   BDSPPatientID AdmissionDate  BDSPEncounterID DiagnosisCodeWithDots
0      150077337    2011-02-14        132846724                852.21
1      150077456    2011-12-07        132860169                852.21
2      150077456    2012-01-07        132867265                432.1 
3      150077456    2012-01-25        132871386                432.1 
4      150063299    2013-06-23        132979960                852.21
13492


In [68]:
#Check which positive ICD's have matching BDSPPatientIDs in the feature matrix 
ICDs = ICDs[ICDs['BDSPPatientID'].isin(set(matrix['BDSPPatientID']))]
ICDs['ICD Date'] = pd.to_datetime(ICDs['AdmissionDate']).dt.strftime('%Y-%m-%d')
ICDs = ICDs[['BDSPPatientID', 'ICD Date','DiagnosisCodeWithDots']]
ICDs.head()

Unnamed: 0,BDSPPatientID,ICD Date,DiagnosisCodeWithDots
5,150000139,2013-10-02,432.1
17,150000139,2013-10-02,432.1
18,150000139,2013-11-13,432.1
32,150000139,2013-11-11,432.1
71,150001368,2010-12-04,432.1


In [69]:
#Make sure that there are only 750 unique positive BIDMC patient IDs
print(ICDs['BDSPPatientID'].nunique())

750


In [70]:
#Verify length of ICDs
print(len(ICDs))

3086


In [71]:
# Import MGB ICD positive CSV
ICDs_MGB = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_dive/2patientIDs_ICD_plus_SDH_MGB.csv')

ICDs_MGB['BDSPPatientID'] = ICDs_MGB['BDSPPatientID'].astype(int)

# Filter ICDs_MGB based on BDSPPatientID in matrix
ICDs_MGB = ICDs_MGB[ICDs_MGB['BDSPPatientID'].isin(set(matrix['BDSPPatientID']))]

# Convert ShiftedContactDTS to datetime and format it as 'YYYY-MM-DD'
ICDs_MGB['ICD Date'] = pd.to_datetime(ICDs_MGB['ShiftedContactDTS'], format='%Y-%m-%d %H:%M:%S.%f').dt.strftime('%Y-%m-%d')

# Select necessary columns BDSPPatientID and ICD Date
ICDs_MGB = ICDs_MGB[['BDSPPatientID', 'ICD Date', 'ICDCD']].rename(columns={'ICDCD': 'DiagnosisCodeWithDots'})

# Print the head of the final DataFrame to ensure the above code worked
ICDs_MGB.head()




Unnamed: 0,BDSPPatientID,ICD Date,DiagnosisCodeWithDots
4,112012057,2022-09-30,432.1
18,115840304,2021-08-10,432.1
19,116269678,2022-10-16,432.1
21,116863283,2017-04-02,432.1
24,119488118,2017-05-12,432.1


In [72]:
#Making sure the above code worked and that there are 750 unique patient ids
print(ICDs_MGB.head())
print(len(ICDs_MGB))
print(ICDs_MGB['BDSPPatientID'].nunique()) 

    BDSPPatientID    ICD Date DiagnosisCodeWithDots
4       112012057  2022-09-30                 432.1
18      115840304  2021-08-10                 432.1
19      116269678  2022-10-16                 432.1
21      116863283  2017-04-02                 432.1
24      119488118  2017-05-12                 432.1
14614
750


In [73]:
# Concatenating the ICD positive lists together
ICDs_compl=[ICDs,ICDs_MGB]
ICDs_complete=pd.concat(ICDs_compl, ignore_index=True)

In [74]:
# Making sure the above code worked. 
print(ICDs_complete.head())
print(len(ICDs_complete))
print(ICDs_complete['BDSPPatientID'].nunique())

   BDSPPatientID    ICD Date DiagnosisCodeWithDots
0      150000139  2013-10-02                432.1 
1      150000139  2013-10-02                432.1 
2      150000139  2013-11-13                432.1 
3      150000139  2013-11-11                432.1 
4      150001368  2010-12-04                432.1 
17700
1500


In [75]:
# Convert Contact Date to date time format
matrix['ContactDate']= pd.to_datetime(matrix['ContactDate']) 
ICDs_complete['ICD Date'] = pd.to_datetime(ICDs_complete['ICD Date'])

In [76]:
# IMPORTANT:
# the code below adds ICD to the feature matrix
# but, it combines all ICDs into one

# Ideally, we would add ICD features separately for each code <--- this is the one we choose to do


"""  # the code below adds ICD to the feature matrix
# Merge the Feature Matrix and ICD code together
# Add 1 or 0 to new column in matrix that denotes whether an icd is present +- 6 months from note creation date
# Merge the dfs on patient ID
merged_df = pd.merge(matrix, ICDs_complete, on='BDSPPatientID')

#check for date timeline
merged_df['ICD'] = merged_df.apply(
    lambda row: (row['ContactDate'] >= row['ICD Date'] - pd.DateOffset(months=6)) and 
                (row['ContactDate'] <= row['ICD Date'] + pd.DateOffset(months=6)), axis=1)
matrix['ICD'] = matrix.apply(
    lambda row: merged_df[(merged_df['BDSPPatientID'] == row['BDSPPatientID']) & merged_df['ICD']].shape[0] > 0, axis=1).astype(int)
"""



"  # the code below adds ICD to the feature matrix\n# Merge the Feature Matrix and ICD code together\n# Add 1 or 0 to new column in matrix that denotes whether an icd is present +- 6 months from note creation date\n# Merge the dfs on patient ID\nmerged_df = pd.merge(matrix, ICDs_complete, on='BDSPPatientID')\n\n#check for date timeline\nmerged_df['ICD'] = merged_df.apply(\n    lambda row: (row['ContactDate'] >= row['ICD Date'] - pd.DateOffset(months=6)) and \n                (row['ContactDate'] <= row['ICD Date'] + pd.DateOffset(months=6)), axis=1)\nmatrix['ICD'] = matrix.apply(\n    lambda row: merged_df[(merged_df['BDSPPatientID'] == row['BDSPPatientID']) & merged_df['ICD']].shape[0] > 0, axis=1).astype(int)\n"

In [77]:
# read the ICD thunderpack for MGB
"""
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB') # read the thunderpack file
df_icd_mgb = []
for i in tqdm(range(1, 511)):    # create a for loop to loop over all keys: ICD_partition_1, ICD_partition_2, ... You can find all keys by `print(list(reader.keys()))`
    df = reader[f'ICD_partition_{i}']
    df = df[['BDSPPatientID', 'ShiftedContactDTS', 'ICDCD']]
    df_icd_mgb.append(df)
df_icd_mgb = pd.concat(df_icd_mgb, axis=0, ignore_index=True)  # concatenate all filtered dataframes from all partitions into one dataframe
df_icd_mgb.to_csv('all_ICDs_MGB_3columns.csv', index=False)
"""


"\nreader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB') # read the thunderpack file\ndf_icd_mgb = []\nfor i in tqdm(range(1, 511)):    # create a for loop to loop over all keys: ICD_partition_1, ICD_partition_2, ... You can find all keys by `print(list(reader.keys()))`\n    df = reader[f'ICD_partition_{i}']\n    df = df[['BDSPPatientID', 'ShiftedContactDTS', 'ICDCD']]\n    df_icd_mgb.append(df)\ndf_icd_mgb = pd.concat(df_icd_mgb, axis=0, ignore_index=True)  # concatenate all filtered dataframes from all partitions into one dataframe\ndf_icd_mgb.to_csv('all_ICDs_MGB_3columns.csv', index=False)\n"

In [78]:
# read the ICD thunderpack for BIDMC
"""
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC')
df_icd_bidmc = []
for i in tqdm(range(1, (len(list(reader.keys()))+1))):  
    df = reader[f'ICD_partition_{i}']
    df = df.rename(columns={'AdmissionDate':'ShiftedContactDTS', 'DiagnosisCodeWithDots':'ICDCD'})[['BDSPPatientID', 'ShiftedContactDTS', 'ICDCD']]
    df_icd_bidmc.append(df)
df_icd_bidmc = pd.concat(df_icd_bidmc, axis=0, ignore_index=True) 
df_icd_bidmc.to_csv('all_ICDs_BIDMC_3columns.csv', index=False)
"""


"\nreader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC')\ndf_icd_bidmc = []\nfor i in tqdm(range(1, (len(list(reader.keys()))+1))):  \n    df = reader[f'ICD_partition_{i}']\n    df = df.rename(columns={'AdmissionDate':'ShiftedContactDTS', 'DiagnosisCodeWithDots':'ICDCD'})[['BDSPPatientID', 'ShiftedContactDTS', 'ICDCD']]\n    df_icd_bidmc.append(df)\ndf_icd_bidmc = pd.concat(df_icd_bidmc, axis=0, ignore_index=True) \ndf_icd_bidmc.to_csv('all_ICDs_BIDMC_3columns.csv', index=False)\n"

In [79]:
"""
import csv

def append_csv(MGB_csv, BIDMC_csv):
    with open(MGB_csv, 'a', newline='') as f_large, open(BIDMC_csv, 'r', newline='') as f_small:
        reader = csv.reader(f_small)
        writer = csv.writer(f_large)
        
        # Skip the header of small.csv
        next(reader, None)
        
        # Append each row from small.csv to large.csv
        for row in reader:
            writer.writerow(row)

# Paths to your CSV files
MGB_csv = '/home/gregory178/Desktop/NAX project/NAX_dive/all_ICDs_MGB_3columns.csv'
BIDMC_csv = '/home/gregory178/Desktop/NAX project/NAX_dive/all_ICDs_BIDMC_3columns.csv'

# Append BIDMC_csv to MGB_csv
append_csv(MGB_csv, BIDMC_csv)
"""


"\nimport csv\n\ndef append_csv(MGB_csv, BIDMC_csv):\n    with open(MGB_csv, 'a', newline='') as f_large, open(BIDMC_csv, 'r', newline='') as f_small:\n        reader = csv.reader(f_small)\n        writer = csv.writer(f_large)\n        \n        # Skip the header of small.csv\n        next(reader, None)\n        \n        # Append each row from small.csv to large.csv\n        for row in reader:\n            writer.writerow(row)\n\n# Paths to your CSV files\nMGB_csv = '/home/gregory178/Desktop/NAX project/NAX_dive/all_ICDs_MGB_3columns.csv'\nBIDMC_csv = '/home/gregory178/Desktop/NAX project/NAX_dive/all_ICDs_BIDMC_3columns.csv'\n\n# Append BIDMC_csv to MGB_csv\nappend_csv(MGB_csv, BIDMC_csv)\n"

In [80]:
ICDs_complete.head()

Unnamed: 0,BDSPPatientID,ICD Date,DiagnosisCodeWithDots
0,150000139,2013-10-02,432.1
1,150000139,2013-10-02,432.1
2,150000139,2013-11-13,432.1
3,150000139,2013-11-11,432.1
4,150001368,2010-12-04,432.1


In [81]:
"""
import pandas as pd

# Example function to parse datetime with multiple formats
def parse_datetime(date_str):
    formats_to_try = ["%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]
    
    for fmt in formats_to_try:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    
    # If none of the formats work, you can handle the error or return None
    return None

# Assuming df_icd_combined is already defined as in your code
# df_icd_combined = pd.concat([df_icd_bidmc, df_icd_mgb], axis=0, ignore_index=True)
# df_icd_combined = df_icd_combined.rename(columns={'ShiftedContactDTS': 'ContactDate'})

# Convert ContactDate using the custom function
df_icd_combined['ContactDate'] = df_icd_combined['ContactDate'].apply(parse_datetime)

# Now df_icd_combined['ContactDate'] will contain datetime objects or NaN where parsing failed

# Continue with your other operations
df_icd_combined['ICDCD'] = df_icd_combined['ICDCD'].astype(str)

icd_codes = ['I62.0', 'S06.5', '432.1', '852.2', '852.3']
icd_codes_regex = [r'^I62\.0', r'^S06\.5', r'^432\.1', r'^(?:852\.2|852\.3)']
icd_codes_col_names = ['ICD_I62.0', 'ICD_S06.5', 'ICD_432.1', 'ICD_852.2or3']

# Assuming 'matrix' is defined elsewhere
for col in icd_codes_col_names:
    matrix[col] = np.nan

# Loop over each row in 'matrix' and perform your operations
for i in tqdm(range(len(matrix))):
    patientid = matrix.loc[i, 'BDSPPatientID']
    note_date = matrix.loc[i, 'ContactDate']
    
    if pd.isnull(note_date):
        continue  # Skip if ContactDate is NaN
    
    time_window_left = note_date - pd.DateOffset(months=6)
    time_window_right = note_date + pd.DateOffset(months=6)
    
    df_subset = df_icd_combined[df_icd_combined['BDSPPatientID'] == patientid]
    df_subset = df_subset[(df_subset['ContactDate'] >= time_window_left) & (df_subset['ContactDate'] <= time_window_right)]
    
    for regex, col in zip(icd_codes_regex, icd_codes_col_names):
        matrix.loc[i, col] = int(any(df_subset['ICDCD'].str.contains(regex, case=False, regex=True)))

# Now 'matrix' should have the desired ICD columns populated based on your conditions
"""


'\nimport pandas as pd\n\n# Example function to parse datetime with multiple formats\ndef parse_datetime(date_str):\n    formats_to_try = ["%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]\n    \n    for fmt in formats_to_try:\n        try:\n            return pd.to_datetime(date_str, format=fmt)\n        except ValueError:\n            continue\n    \n    # If none of the formats work, you can handle the error or return None\n    return None\n\n# Assuming df_icd_combined is already defined as in your code\n# df_icd_combined = pd.concat([df_icd_bidmc, df_icd_mgb], axis=0, ignore_index=True)\n# df_icd_combined = df_icd_combined.rename(columns={\'ShiftedContactDTS\': \'ContactDate\'})\n\n# Convert ContactDate using the custom function\ndf_icd_combined[\'ContactDate\'] = df_icd_combined[\'ContactDate\'].apply(parse_datetime)\n\n# Now df_icd_combined[\'ContactDate\'] will contain datetime objects or NaN where parsing failed\n\n# Continue with your other operations\ndf_icd_combined[\

In [82]:
# df_icd_mgb = pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_dive/all_ICDs_MGB_3columns.csv")
# df_icd_bidmc = pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_dive/all_ICDs_BIDMC_3columns.csv")

# add ICD features separately for each code
# ICDs_complete = pd.concat([df_icd_bidmc, df_icd_mgb], axis=0, ignore_index=True)
ICDs_complete = ICDs_complete.rename(columns={'ICD Date': 'ContactDate'})
ICDs_complete['ContactDate'] = pd.to_datetime(ICDs_complete.ContactDate)
ICDs_complete['DiagnosisCodeWithDots'] = ICDs_complete.DiagnosisCodeWithDots.astype(str)

icd_codes = ['I62.0', 'S06.5', '432.1', '852.2', '852.3']
# a sensible way to combine them to reduce duplication is tocombine 852.2 and 852.3, and keep others separate
icd_codes_regex = [r'^I62\.0', r'^S06\.5', r'^432\.1', r'^(?:852\.2|852\.3)']
icd_codes_col_names = ['ICD_I62.0', 'ICD_S06.5', 'ICD_432.1', 'ICD_852.2or3']
for col in icd_codes_col_names:
    matrix[col] = np.nan

# for i in tqdm(range(len(matrix))):
#     patientid = matrix.BDSPPatientID.iloc[i]
#     note_date = matrix.ContactDate.iloc[i]
#     time_window_left = note_date - pd.DateOffset(months=6)
#     time_window_right = note_date + pd.DateOffset(months=6)
#     df_subset = ICDs_complete[ICDs_complete.BDSPPatientID==patientid]  # this step limits to this patient
#     df_subset = df_subset[(ICDs_complete.ContactDate>=time_window_left)&(ICDs_complete.ContactDate<=time_window_right)]  # this step limits to the time window
#     for regex, col in zip(icd_codes_regex, icd_codes_col_names):
#         matrix[col] = df_subset.apply(lambda row: 1 if row['DiagnosisCodeWithDots'].str.contains(regex, case=False, regex=True).any() else 0, axis=1)

for i in tqdm(range(len(matrix))):
    patientid = matrix.BDSPPatientID.iloc[i]
    note_date = matrix.ContactDate.iloc[i]
    time_window_left = note_date - pd.DateOffset(months=6)
    time_window_right = note_date + pd.DateOffset(months=6)
    df_subset = ICDs_complete[ICDs_complete.BDSPPatientID == patientid]
    df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
    for regex, col in zip(icd_codes_regex, icd_codes_col_names):
        matrix.loc[i, col] = (df_subset['DiagnosisCodeWithDots'].str.contains(regex, case=False, regex=True).any()).astype(int)



  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_subset = df_subset[(ICDs_complete.ContactDate >= time_window_left) & (ICDs_complete.ContactDate <= time_window_right)]
  df_sub

In [83]:
matrix.head()

Unnamed: 0,BDSPPatientID,ContactDate,NoteFileName,Site,CT,MRI,acut sdh_pos,brain injuri_pos,brain mri_pos,burr hole_pos,...,trauma_neg,prior sdh_neg,recent sdh_neg,resolv sdh_neg,known sdh_neg,history_sdh,ICD_I62.0,ICD_S06.5,ICD_432.1,ICD_852.2or3
0,120109726,2019-12-01,Notes_13393227243_2508134861_20191201.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,1.0,1.0,1.0,0.0
1,111971091,2021-05-30,Notes_13517098931_6064385669_20210530.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,0.0,1.0,1.0,0.0
2,114651683,2022-03-09,Notes_13621620103_9363901699_20220309.txt,MGB,0,1,0,1,0,0,...,0,0,0,0,0,0,1.0,1.0,1.0,0.0
3,115288640,2021-03-02,Notes_13554067774_6841967671_20210302.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1.0,1.0,1.0,0.0
4,115389340,2019-12-02,Notes_13481394171_4791230696_20191202.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1.0,1.0,1.0,0.0


In [84]:
cols_to_convert = ['ICD_I62.0', 'ICD_S06.5', 'ICD_432.1', 'ICD_852.2or3']

for col in cols_to_convert:
    matrix[col] = matrix[col].astype(int)

matrix.head()

Unnamed: 0,BDSPPatientID,ContactDate,NoteFileName,Site,CT,MRI,acut sdh_pos,brain injuri_pos,brain mri_pos,burr hole_pos,...,trauma_neg,prior sdh_neg,recent sdh_neg,resolv sdh_neg,known sdh_neg,history_sdh,ICD_I62.0,ICD_S06.5,ICD_432.1,ICD_852.2or3
0,120109726,2019-12-01,Notes_13393227243_2508134861_20191201.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
1,111971091,2021-05-30,Notes_13517098931_6064385669_20210530.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,114651683,2022-03-09,Notes_13621620103_9363901699_20220309.txt,MGB,0,1,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
3,115288640,2021-03-02,Notes_13554067774_6841967671_20210302.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
4,115389340,2019-12-02,Notes_13481394171_4791230696_20191202.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0


In [85]:
#Print a sample to ensure it worked
print(matrix.sample(20))

      BDSPPatientID ContactDate                               NoteFileName  \
2790      150685203  2018-03-25  Notes_1130543740_11494605478_20180325.txt   
2715      150950965  2016-06-18   Notes_1130809899_2121779999_20160618.txt   
107       112207248  2018-07-04  Notes_13316540243_1856285870_20180704.txt   
2         114651683  2022-03-09  Notes_13621620103_9363901699_20220309.txt   
61        120345747  2017-12-16  Notes_13395883170_1720844451_20171216.txt   
1050      120513180  2016-12-17  Notes_13281417410_1333671558_20161217.txt   
1434      120461581  2021-11-01  Notes_13595031196_5595449065_20211101.txt   
2252      151136744  2019-05-31    Notes_1130995807_408200447_20190531.txt   
501       115599431  2016-11-03  Notes_13390241096_1715651887_20161103.txt   
439       114577524  2022-06-04  Notes_13608547805_7616395601_20220604.txt   
79        119488118  2017-04-01  Notes_13362667479_1445720494_20170401.txt   
1398      112492385  2021-11-18  Notes_13462597606_5064251178_20

In [86]:
print(matrix)


      BDSPPatientID ContactDate                               NoteFileName  \
0         120109726  2019-12-01  Notes_13393227243_2508134861_20191201.txt   
1         111971091  2021-05-30  Notes_13517098931_6064385669_20210530.txt   
2         114651683  2022-03-09  Notes_13621620103_9363901699_20220309.txt   
3         115288640  2021-03-02  Notes_13554067774_6841967671_20210302.txt   
4         115389340  2019-12-02  Notes_13481394171_4791230696_20191202.txt   
...             ...         ...                                        ...   
2995      150057113  2018-09-21   Notes_1129915942_2473056145_20180921.txt   
2996      150740592  2010-10-10   Notes_1130599114_1436450624_20101010.txt   
2997      150616738  2019-08-06   Notes_1130475398_1086092807_20190806.txt   
2998      150044220  2016-12-04   Notes_1129902668_2977113246_20161204.txt   
2999      150028704  2020-11-04  Notes_1129887208_12495579047_20201104.txt   

       Site  CT  MRI  acut sdh_pos  brain injuri_pos  brain mri

In [87]:
# Convert final feature matrix including CPT, ICD, and features to a csv

matrix.to_csv('feature_matrix_notes_CPT_and_ICD_Draft_15.csv', index=False)
matrix.head()

Unnamed: 0,BDSPPatientID,ContactDate,NoteFileName,Site,CT,MRI,acut sdh_pos,brain injuri_pos,brain mri_pos,burr hole_pos,...,trauma_neg,prior sdh_neg,recent sdh_neg,resolv sdh_neg,known sdh_neg,history_sdh,ICD_I62.0,ICD_S06.5,ICD_432.1,ICD_852.2or3
0,120109726,2019-12-01,Notes_13393227243_2508134861_20191201.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
1,111971091,2021-05-30,Notes_13517098931_6064385669_20210530.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,114651683,2022-03-09,Notes_13621620103_9363901699_20220309.txt,MGB,0,1,0,1,0,0,...,0,0,0,0,0,0,1,1,1,0
3,115288640,2021-03-02,Notes_13554067774_6841967671_20210302.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
4,115389340,2019-12-02,Notes_13481394171_4791230696_20191202.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0


In [88]:
#Purpose of this cell is to merge the newly formed matrix to the annotations
#Load in matrix
X = pd.read_csv('/home/gregory178/Desktop/NAX project/FM_Draft_15/feature_matrix_notes_CPT_and_ICD_Draft_15.csv')

# Load in Annotation csv
y_data_pre = pd.read_csv('/home/gregory178/Desktop/Annotation_Results/BIDMC_+_Minus_MGB_+_-.csv')
y = y_data_pre[['BDSPPatientID', 'annot','hospital']]

# Verify column names in X and y
print("Columns in X:", X.columns)
print("Columns in y:", y.columns)

# Merge X and y based on 'BDSPPatientID'
Xy = pd.merge(X, y, on='BDSPPatientID', how='inner', validate='1:1')
Xy.to_csv('/home/gregory178/Desktop/NAX project/FM_Draft_15/Complete_merged_feature_matrix_notes_CPT_and_ICD_Draft_15.csv')

Columns in X: Index(['BDSPPatientID', 'ContactDate', 'NoteFileName', 'Site', 'CT', 'MRI',
       'acut sdh_pos', 'brain injuri_pos', 'brain mri_pos', 'burr hole_pos',
       'stabl sdh_pos', 'craniectomi_pos', 'craniotomi_pos', 'ct head_pos',
       'drainag_pos', 'evacu_pos', 'head ct_pos', 'hematoma_pos',
       'chronic sdh_pos', 'herniat_pos', 'intracrani hemorrhag_pos',
       'intraparenchym hemorrhag_pos', 'midlin shift_pos', 'mva_pos',
       'mvc_pos', 'neurosurgeri_pos', 'neurosurg intervent_pos', 'scan_pos',
       'sdh_pos', 'subdur_pos', 'tbi_pos', 'tentorium_pos', 'thick_pos',
       'trauma_pos', 'prior sdh_pos', 'recent sdh_pos', 'resolv sdh_pos',
       'known sdh_pos', 'acut sdh_neg', 'brain injuri_neg', 'brain mri_neg',
       'burr hole_neg', 'stabl sdh_neg', 'craniectomi_neg', 'craniotomi_neg',
       'ct head_neg', 'drainag_neg', 'evacu_neg', 'head ct_neg',
       'hematoma_neg', 'chronic sdh_neg', 'herniat_neg',
       'intracrani hemorrhag_neg', 'intraparenchym 

In [89]:
##Now we are going to split the data into training and testing data randomly

df = pd.read_csv('/home/gregory178/Desktop/NAX project/FM_Draft_15/Complete_merged_feature_matrix_notes_CPT_and_ICD_Draft_15.csv')

# Randomly Sample 1500 rows
sampled_df = df.sample(n=1500, random_state=13)

# Save randomly sampled data to CSV
sampled_df.to_csv('train_data_Draft_15.csv', index=False)

# Save remaining data to CSV as testing data
remaining_df = df.drop(sampled_df.index)
remaining_df.to_csv('test_data_Draft_15.csv', index=False)

#Verify that the two data sets do not share any BDSPPatientIDs
train_ids = set(sampled_df['BDSPPatientID'])
test_ids = set(remaining_df['BDSPPatientID'])

shared_ids = train_ids.intersection(test_ids)

if len(shared_ids) == 0:
    print("Verification passed: No common BDSPPatientID between train_data_Draft_15.csv and test_data_Draft_15.csv")
else:
    print("Verification failed: Common BDSPPatientID found between train_data_Draft_15.csv and test_data_Draft_15.csv")
    print("Shared IDs:", shared_ids)

Verification passed: No common BDSPPatientID between train_data_Draft_15.csv and test_data_Draft_15.csv


In [90]:
#Code to improve the feature matrix. We are going to see which features in the training data have less
#than 10 instances of 1 and/or -1 and remove them. This number was chosen as a benchmark to make sure that 
#the features used are relevant. 

features_before= sampled_df.loc[:, "CT":"ICD_852.2or3"]

features_before

total_counts = features_before.apply(lambda x: (x == 1).sum() + (x == -1).sum())

# Step 2: Determine if the total count of 1's and -1's is less than 20 for each column
for col, count in total_counts.items():
    if count < 10:
        print(f"For column '{col}', total count ({count}) of 1's and -1's is less than 10. Remove feature.")
    else:
        print(f"For column '{col}', total count ({count}) of 1's and -1's is 10 or more. Keep feature.")

For column 'CT', total count (746) of 1's and -1's is 10 or more. Keep feature.
For column 'MRI', total count (581) of 1's and -1's is 10 or more. Keep feature.
For column 'acut sdh_pos', total count (32) of 1's and -1's is 10 or more. Keep feature.
For column 'brain injuri_pos', total count (79) of 1's and -1's is 10 or more. Keep feature.
For column 'brain mri_pos', total count (75) of 1's and -1's is 10 or more. Keep feature.
For column 'burr hole_pos', total count (36) of 1's and -1's is 10 or more. Keep feature.
For column 'stabl sdh_pos', total count (20) of 1's and -1's is 10 or more. Keep feature.
For column 'craniectomi_pos', total count (37) of 1's and -1's is 10 or more. Keep feature.
For column 'craniotomi_pos', total count (189) of 1's and -1's is 10 or more. Keep feature.
For column 'ct head_pos', total count (319) of 1's and -1's is 10 or more. Keep feature.
For column 'drainag_pos', total count (177) of 1's and -1's is 10 or more. Keep feature.
For column 'evacu_pos', t

In [91]:
#Checking that the feature matrix is still aligned 
# File paths for your CSV files
file1 = '/home/gregory178/Desktop/NAX project/NAX_dive/train_data.csv'
file2 = '/home/gregory178/Desktop/NAX project/FM_Draft_15/train_data_Draft_15.csv'

# Column name to compare
column_to_compare = 'BDSPPatientID'

# Read CSV files into pandas DataFrames
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Extract the specific column from each DataFrame
column1 = df1[column_to_compare]
column2 = df2[column_to_compare]

# Check if the columns have the exact same values in the same order
if column1.equals(column2):
    print(f"The columns '{column_to_compare}' in both CSV files have the same values in the same order.")
else:
    print(f"The columns '{column_to_compare}' in both CSV files do not have the same values in the same order.")


The columns 'BDSPPatientID' in both CSV files have the same values in the same order.


In [92]:

# Verifying that the values are mutually exclusive

# Example: Assuming 'sdh_pos' and 'history_sdh' columns exist in feature_matrix

# 1. Count rows where both sdh_pos and history_sdh are 1
both_ones = df2[(df2['sdh_pos'] == 1) & (df2['history_sdh'] == 1)].shape[0]

# 2. Count rows where sdh_pos is 1 and history_sdh is 0
pos_one_neg_zero = df2[(df2['sdh_pos'] == 1) & (df2['history_sdh'] == 0)].shape[0]

# 3. Count rows where history_sdh is 1 and sdh_pos is 0
neg_one_pos_zero = df2[(df2['sdh_pos'] == 0) & (df2['history_sdh'] == 1)].shape[0]

# 4. Count rows where both sdh_pos and history_sdh are 0
both_zeros = df2[(df2['sdh_pos'] == 0) & (df2['history_sdh'] == 0)].shape[0]

# Print the results
print(f"1. Rows where sdh_pos and history_sdh are both 1: {both_ones}")
print(f"2. Rows where sdh_pos is 1 and history_sdh is 0: {pos_one_neg_zero}")
print(f"3. Rows where history_sdh is 1 and sdh_pos is 0: {neg_one_pos_zero}")
print(f"4. Rows where sdh_pos and history_sdh are both 0: {both_zeros}")

# #Previous before tightening neg_range 
# 1. Rows where sdh_pos and history_sdh are both 1: 45
# 2. Rows where sdh_pos is 1 and history_sdh is 0: 244
# 3. Rows where history_sdh is 1 and sdh_pos is 0: 110
# 4. Rows where sdh_pos and history_sdh are both 0: 1101


1. Rows where sdh_pos and history_sdh are both 1: 0
2. Rows where sdh_pos is 1 and history_sdh is 0: 442
3. Rows where history_sdh is 1 and sdh_pos is 0: 40
4. Rows where sdh_pos and history_sdh are both 0: 1018


In [93]:
#Original Output of Code above
# Features to remove:
# For column 'acute on chronic sdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'csdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'evacuation', total count (1) of 1's and -1's is less than 20. Remove feature.
# For column 'subacute to chronic sdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'subacute sdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'tsdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'acute sdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'spontaneous sdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'subdural haemorrhage', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'bicycle accident', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'tentorial', total count (2) of 1's and -1's is less than 20. Remove feature.
# For column 'imaging', total count (6) of 1's and -1's is less than 20. Remove feature.
# For column 'brain injury', total count (5) of 1's and -1's is less than 20. Remove feature.
# For column 'bilateral sdh', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'hematoma thickness', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'intraparenchymal hemorrhage', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'chronic sdh with acute', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'herniation', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'skull fracture', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'intracranial hemorrhage', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'hemorrhage', total count (4) of 1's and -1's is less than 20. Remove feature.
# For column 'head-strike', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'bleed in the brain', total count (1) of 1's and -1's is less than 20. Remove feature.
# For column 'motor vehicle accident', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'craniotomy', total count (1) of 1's and -1's is less than 20. Remove feature.
# For column 'bilateral', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'bike accident', total count (0) of 1's and -1's is less than 20. Remove feature.
# For column 'craniectomy', total count (1) of 1's and -1's is less than 20. Remove feature.
# For column 'subdural', total count (9) of 1's and -1's is less than 20. Remove feature.
# For column 'asdh', total count (0) of 1's and -1's is less than 20. Remove feature.


# Features to keep:
# For column 'CT', total count (746) of 1's and -1's is 20 or more. Keep feature.
# For column 'MRI', total count (581) of 1's and -1's is 20 or more. Keep feature.
# For column 'mri brain', total count (133) of 1's and -1's is 20 or more. Keep feature.
# For column 'tentorium', total count (22) of 1's and -1's is 20 or more. Keep feature.
# For column 'sdh', total count (484) of 1's and -1's is 20 or more. Keep feature.
# For column 'tbi', total count (99) of 1's and -1's is 20 or more. Keep feature.
# For column 'burr hole', total count (36) of 1's and -1's is 20 or more. Keep feature.
# For column 'head ct', total count (281) of 1's and -1's is 20 or more. Keep feature.
# For column 'mvc', total count (22) of 1's and -1's is 20 or more. Keep feature.
# For column 'head strike', total count (64) of 1's and -1's is 20 or more. Keep feature.
# For column 'hematoma', total count (284) of 1's and -1's is 20 or more. Keep feature.
# For column 'fall', total count (335) of 1's and -1's is 20 or more. Keep feature.
# For column 'r sdh', total count (81) of 1's and -1's is 20 or more. Keep feature.
# For column 'brain mri', total count (77) of 1's and -1's is 20 or more. Keep feature.
# For column 'fell', total count (132) of 1's and -1's is 20 or more. Keep feature.
# For column 'l sdh', total count (76) of 1's and -1's is 20 or more. Keep feature.
# For column 'ct head', total count (336) of 1's and -1's is 20 or more. Keep feature.
# For column 'scan', total count (346) of 1's and -1's is 20 or more. Keep feature.
# For column 'ICD_I62.0', total count (234) of 1's and -1's is 20 or more. Keep feature.
# For column 'ICD_S06.5', total count (425) of 1's and -1's is 20 or more. Keep feature.
# For column 'ICD_432.1', total count (380) of 1's and -1's is 20 or more. Keep feature.
# For column 'ICD_852.2or3', total count (91) of 1's and -1's is 20 or more. Keep feature
# For column 'brain bleed', total count (13) of 1's and -1's is less than 20. Remove feature.