In [15]:
#Import all necessary modules. 
#Optional steps 1-5 can be used to make new sampling cohorts, however, you must annually annotate the results from steps 3 and 4 by putting the file paths in the annotation tool
#and checking if SDH is present in the patient notes using key words. 
#Steps 1-25 can be used to replicate this data. Just click play and run them in order. This folder should have all the files needed to replicate this data. 
#If part of the Prophet project, please skip steps 6, 13, 14, 18, 19, 20, and 21 for the data. 

#Import necessary modules
import pandas as pd
import re
import nltk
from nltk.stem import SnowballStemmer

In [16]:
#Load in Positive and Negative Sampling Cohorts for MGB and BIDMC
MGB_plus=pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_SDH/MGB_sampling_cohort_ICD+_discharge_notes.csv")
MGB_minus=pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_SDH/MGB_sampling_cohort_ICD_minus_discharge_notes.csv")
BI_plus=pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_SDH/BI_sampling_cohort_ICD+_initial_notes.csv")
BI_minus=pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_SDH/BI_sampling_cohort_ICD_minus_initial_notes.csv")

In [17]:
#Concatenate the BIDMC+/- and MGB+/- notes together. 
#Make MGB and BIDMC seperate for CPT codes because MGB and BIDMC data work differently for it. 
MGB_Complete_Notes = pd.concat([MGB_plus,MGB_minus], ignore_index=True)
BIDMC_Complete_Notes= pd.concat([BI_plus,BI_minus], ignore_index=True)

MGB_Complete_Notes.to_csv("MGB_Complete_Notes.csv", index=False)
BIDMC_Complete_Notes.to_csv("BIDMC_Complete_Notes.csv", index=False)

In [18]:
#Verifying it concatenated correctly.
# MGB
df=pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/MGB_Complete_Notes.csv')
print(len(df))
print(df.columns)
print(df)

# BIDMC
df1=pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/BIDMC_Complete_Notes.csv')
print(len(df1))
print(df1.columns)
print(df1)

1500
Index(['BDSPPatientID', 'NoteContent', 'ContactDate', 'WordCount',
       'NoteFileName', 'Site'],
      dtype='object')
      BDSPPatientID                                        NoteContent  \
0         120109726    Physician ***** *****       Admit date: ****...   
1         111971091    Physician ***** *****       Admit date: ****...   
2         114651683    Physician ***** *****       Admit date: ****...   
3         115288640    Physician ***** *****       Admit date: ****...   
4         115389340    Physician ***** *****       Admit date: ****...   
...             ...                                                ...   
1495      116101588    Physician ***** *****       Admit date: ****...   
1496      111193161   Note *****: Detach Note + Clinical Encounter ...   
1497      117924962    Physician ***** *****       Admit date: ****...   
1498      121552284    Physician ***** *****       Admit date: ****...   
1499      122349255    Physician ***** *****       Admit dat

In [20]:
#Adding CPT Columns to the feature matrix.
#We will start with MGB CPT Codes. Because there was a CSV for CPT MGB, we will use it to find specific CPT codes for Brain MRI and Head CT.
#Read in the CPT data from MGB. 
CPTs = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/patientIDs_CPT_HeadMRICT_MGB.csv')

#Filter CPT Patient Ids to only those in your matrix.
CPTs = CPTs[CPTs['BDSPPatientID'].isin(set(MGB_Complete_Notes['BDSPPatientID']))]

#Convert 'StartDTS' to datetime.
CPTs['CPT Date'] = pd.to_datetime(CPTs['StartDTS'], errors='coerce')

#Select relevant columns from CPT data.
CPTs = CPTs[['BDSPPatientID', 'CPT Date', 'CPT']]

#Define function that checks if cpt date and contact date are within 6 months of each other.
def within_six_months(contact_date, cpt_date):
    return (contact_date >= cpt_date - pd.DateOffset(months=6)) & (contact_date <= cpt_date + pd.DateOffset(months=6))


#Initialize columns in matrix for MRI and CT presence. These columns will both be features in the feature matrix.
MGB_Complete_Notes['MRI'] = 0
MGB_Complete_Notes['CT'] = 0

#Convert ContactDate to datetime.
MGB_Complete_Notes['ContactDate'] = pd.to_datetime(MGB_Complete_Notes['ContactDate'], errors='coerce')

#Iterate over each patient in matrix.
for index, row in MGB_Complete_Notes.iterrows():
    patient_id = row['BDSPPatientID']
    contact_date = row['ContactDate']

    #Check for MRI CPTs
    for _, cpt_row in CPTs[(CPTs['BDSPPatientID'] == patient_id) & (CPTs['CPT'].isin([70450, 70460, 70470]))].iterrows():
        cpt_date = cpt_row['CPT Date']
        if pd.notnull(contact_date) and pd.notnull(cpt_date) and within_six_months(contact_date, cpt_date):
            MGB_Complete_Notes.at[index, 'MRI'] = 1
            #Once found, no need to check further.
            break 

    #Check for CT CPTs
    for _, cpt_row in CPTs[(CPTs['BDSPPatientID'] == patient_id) & (CPTs['CPT'].isin([70551, 70552, 70553]))].iterrows():
        cpt_date = cpt_row['CPT Date']
        if pd.notnull(contact_date) and pd.notnull(cpt_date) and within_six_months(contact_date, cpt_date):
            MGB_Complete_Notes.at[index, 'CT'] = 1
            #Once found, no need to check further. 
            break  

#Convert resulting MGB_Complete_Notes with updated CT and MRI columns to CSV
MGB_Complete_Notes.to_csv('MGB_CPT_.csv', index=False)

#We will now make BIDMC CPT columns. 
# For each patient use regex to search for cpt within note text of matrix to fill in matrix value instead of using cpt codes due to lack of available info on BIDMC.
# This code will allow us to assign a 1 to the patient if it contains the regex word or a 0 if it does not. 
BIDMC_Complete_Notes['MRI'] = BIDMC_Complete_Notes['NoteContent'].str.contains(r'(?:brain mri|mri brain|head mri|mri head)', regex=True, case=False, na=False).astype(int)
BIDMC_Complete_Notes['CT'] = BIDMC_Complete_Notes['NoteContent'].str.contains(r'(?:ct head|head ct|brain ct|ct brain|hct)', regex=True, case=False, na=False).astype(int)
BIDMC_Complete_Notes.to_csv('BIDMC_CPT_.csv', index=False)

#Assigning df and df1 variables to MGB and BIDMC
df = pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_SDH/MGB_CPT_.csv")
df1 = BIDMC_Complete_Notes

#Making sure that the CPT code created the MRI and CT columns.
df1.head()

Unnamed: 0,BDSPPatientID,NoteContent,ContactDate,WordCount,NoteFileName,Site,MRI,CT
0,150683141,\n\nNote Date: *****/*****/*****\n\nNote Type:...,2016-06-01,1227,Notes_1130542094_11409543435_20160601.txt,BIDMC,0,0
1,150015356,\n\nNote Date: *****/*****/*****\n\nNote Type:...,2017-04-07,1663,Notes_1129874480_3082345375_20170407.txt,BIDMC,0,1
2,150000686,\n\nNote Date: *****/*****/*****\n\nNote Type:...,2020-01-11,912,Notes_1129859284_277121157_20200111.txt,BIDMC,0,0
3,150000451,\n\nNote Date: *****/*****/*****\n\nNote Type:...,2016-01-22,1683,Notes_1129859570_238148159_20160122.txt,BIDMC,0,1
4,150014747,\n\nNote Date: *****/*****/*****\n\nNote Type:...,2016-06-25,1276,Notes_1129873727_3024909815_20160625.txt,BIDMC,0,1


In [21]:
##Here we are creating the feature matrix using key words for MGB

#Importing necessary modules
from tqdm.notebook import tqdm
tqdm.pandas()

nltk.download('punkt')

#Initialize the Snowball Stemmer for English
stemmer = SnowballStemmer("english")

#Define keywords and phrases. Put in your own phrases as needed as long as they are stemmed. 
#Keywords before stemming
#  "acute sdh", "brain injury", "brain mri",
    # "burr hole", "chronic sdh", "craniectomy", "craniotomy", "ct head", "drainage", "epidural hemorrhage",
    # "evacuation", "head ct", "hematoma",
    # "herniation", "imaging", "intracranial hemorrhage", "intraparenchymal hemorrhage",
    # "midline shift", "mri brain", "mva", "mvc", "neurosurgery", "neurosurgical intervention",
    # "scan", "sdh", "subdural", "tbi", "tentorium", "thickness", "trauma"

#keywords below after stemming
keywords = [
    "acut sdh", "brain injuri", "brain mri",
    "burr hole", "stabl sdh", "craniectomi", "craniotomi", "ct head", "drainag",
    "evacu", "head ct", "hematoma","chronic sdh",
    "herniat", "intracrani hemorrhag", "intraparenchym hemorrhag",
    "midlin shift", "mva", "mvc", "neurosurgeri", "neurosurg intervent",
    "scan", "sdh", "subdur", "tbi", "tentorium", "thick", "trauma", "prior sdh", "recent sdh", "resolv sdh",
    "known sdh"
]
 
# Function to preprocess text (lowercase, stemming using NLTK)
def preprocess_text(text):
    #Convert text to lowercase
    text = text.lower()  
    # Replace specific terms as needed to SDH.
    text = re.sub(r"\bsubdural (hematoma|hemorrhage)\b", "sdh", text)   
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    #Getting rid of extra spacing. 
    return ' '.join(stemmed_tokens) 


# Preprocess the 'NoteContent' column to make usre all text is lowered and has one space in between.
df['NoteContent'] = df['NoteContent'].apply(preprocess_text)

#history pattern. Only want to apply to sdh. 
history_patterns = {}
sdh_pattern = r"\b(?:histori\b|histori:\b|hx\b|pth\b|pmh\b)[^,.]{0,20}\bsd(?:h)?\b"
history_patterns["sdh"] = sdh_pattern


# generate neg and pos patterns
pos_patterns = {}
for phrase in keywords:
    #stemmed_phrase = stem_phrase(phrase, stemmer)
    words = phrase.split()
    pos_patterns[phrase] = r"\b" + r"".join([f"{word}\s*" for word in words]) +r"\b"

# Adding negator words
neg_patterns = {}
for phrase in keywords:
    #stemmed_phrase = stem_phrase(phrase, stemmer)  # no need since the keywords are already in basic form
    words = phrase.split()
    neg_patterns[phrase] = r"\b" + r"(?:(?:negat|no\b|not\b)[^,.]{{0,20}}{pattern})|(?:{pattern}[^.]{{0,10}}(?:absent\b|negat))".format(
        pattern= r''.join([f"{word}\s*" for word in words]) +r"\b" )


def check_patterns(text, neg_patterns, pos_patterns, history_patterns):
    matches = {f"{key}_pos": 0 for key in pos_patterns}
    matches.update({f"{key}_neg": 0 for key in neg_patterns})
    
    # Check if sdh falls under history_patterns
    sdh_history_matches = [(m.start(), m.end()) for m in re.finditer(history_patterns["sdh"], text)]
    if sdh_history_matches:
        matches["history_sdh"] = 1
    else:
        matches["history_sdh"] = 0

    # Step 1: Check for negative patterns first
    for key in neg_patterns:
        neg_matches = [(m.start(), m.end()) for m in re.finditer(neg_patterns[key], text)]
        if neg_matches:
            matches[f"{key}_neg"] = 1

    # Step 2: Check for positive patterns that are not within any negative pattern
    for key_pos in pos_patterns:
        pos_matches = [(m.start(), m.end()) for m in re.finditer(pos_patterns[key_pos], text)]
        if pos_matches:
            # Check if positive match is within history_patterns["sdh"]
            pos_within_sdh_history = False
            for start, end in pos_matches:
                for h_start, h_end in sdh_history_matches:
                    if h_start <= start <= h_end or h_start <= end <= h_end:
                        pos_within_sdh_history = True
                        break
                if pos_within_sdh_history:
                    break
            if not pos_within_sdh_history:
                # Check if positive match is not within any negative pattern
                is_positive = True
                for start, end in pos_matches:
                    for key_neg in neg_patterns:
                        neg_matches = [(m.start(), m.end()) for m in re.finditer(neg_patterns[key_neg], text)]
                        for n_start, n_end in neg_matches:
                            if n_start <= start <= n_end or n_start <= end <= n_end:
                                is_positive = False
                                break
                        if not is_positive:
                            break
                    if is_positive:
                        matches[f"{key_pos}_pos"] = 1
                        break

    return matches


# Applying all of the functions above
feature_matrix = df['NoteContent'].apply(lambda note_text: pd.Series(check_patterns(note_text, neg_patterns, pos_patterns, history_patterns)))
result_df = pd.concat([df[['BDSPPatientID', 'ContactDate', 'NoteFileName', 'Site', 'CT', 'MRI']], feature_matrix], axis=1)

# Save result to CSV file
result_df.to_csv('feature_matrix_MGB_.csv', index=False)

[nltk_data] Downloading package punkt to /home/gregory178/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
## Here we are creating the feature matrix using key words for BIDMC

# Importing necessary modules
from tqdm.notebook import tqdm
tqdm.pandas()

nltk.download('punkt')

# Initialize the Snowball Stemmer for English
stemmer = SnowballStemmer("english")

# Define keywords and phrases
keywords = [
    "acut sdh", "brain injuri", "brain mri",
    "burr hole", "stabl sdh", "craniectomi", "craniotomi", "ct head", "drainag",
    "evacu", "head ct", "hematoma","chronic sdh",
    "herniat", "intracrani hemorrhag", "intraparenchym hemorrhag",
    "midlin shift", "mva", "mvc", "neurosurgeri", "neurosurg intervent",
    "scan", "sdh", "subdur", "tbi", "tentorium", "thick", "trauma", "prior sdh", "recent sdh", "resolv sdh",
    "known sdh"
]

# Function to preprocess text (lowercase, stemming using NLTK)
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    # Replace specific terms as needed
    text = re.sub(r"\bsubdural (hematoma|hemorrhage)\b", "sdh", text)  # Example replacement
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)


# Preprocess the 'NoteContent' column
df1['NoteContent'] = df1['NoteContent'].apply(preprocess_text)

#history pattern. Only want to apply to sdh. 
history_patterns = {}
sdh_pattern = r"\b(?:histori\b|histori:\b|hx\b|pth\b|pmh\b)[^,.]{0,20}\bsd(?:h)?\b"
history_patterns["sdh"] = sdh_pattern


# generate neg and pos patterns
pos_patterns = {}
for phrase in keywords:
    #stemmed_phrase = stem_phrase(phrase, stemmer)
    words = phrase.split()
    pos_patterns[phrase] = r"\b" + r"".join([f"{word}\s*" for word in words]) +r"\b"

# Adding negator words
neg_patterns = {}
for phrase in keywords:
    #stemmed_phrase = stem_phrase(phrase, stemmer)  # no need since the keywords are already in basic form
    words = phrase.split()
    neg_patterns[phrase] = r"\b" + r"(?:(?:negat|no\b|not\b)[^,.]{{0,20}}{pattern})|(?:{pattern}[^.]{{0,10}}(?:absent\b|negat))".format(
        pattern= r''.join([f"{word}\s*" for word in words]) +r"\b" )



def check_patterns(text, neg_patterns, pos_patterns, history_patterns):
    matches = {f"{key}_pos": 0 for key in pos_patterns}
    matches.update({f"{key}_neg": 0 for key in neg_patterns})
    
    # Check if sdh falls under history_patterns
    sdh_history_matches = [(m.start(), m.end()) for m in re.finditer(history_patterns["sdh"], text)]
    if sdh_history_matches:
        matches["history_sdh"] = 1
    else:
        matches["history_sdh"] = 0

    # Step 1: Check for negative patterns first
    for key in neg_patterns:
        neg_matches = [(m.start(), m.end()) for m in re.finditer(neg_patterns[key], text)]
        if neg_matches:
            matches[f"{key}_neg"] = 1

    # Step 2: Check for positive patterns that are not within any negative pattern
    for key_pos in pos_patterns:
        pos_matches = [(m.start(), m.end()) for m in re.finditer(pos_patterns[key_pos], text)]
        if pos_matches:
            # Check if positive match is within history_patterns["sdh"]
            pos_within_sdh_history = False
            for start, end in pos_matches:
                for h_start, h_end in sdh_history_matches:
                    if h_start <= start <= h_end or h_start <= end <= h_end:
                        pos_within_sdh_history = True
                        break
                if pos_within_sdh_history:
                    break
            if not pos_within_sdh_history:
                # Check if positive match is not within any negative pattern
                is_positive = True
                for start, end in pos_matches:
                    for key_neg in neg_patterns:
                        neg_matches = [(m.start(), m.end()) for m in re.finditer(neg_patterns[key_neg], text)]
                        for n_start, n_end in neg_matches:
                            if n_start <= start <= n_end or n_start <= end <= n_end:
                                is_positive = False
                                break
                        if not is_positive:
                            break
                    if is_positive:
                        matches[f"{key_pos}_pos"] = 1
                        break

    return matches




# Applying functions above to make the feature matrix
feature_matrix = df1['NoteContent'].apply(lambda note_text: pd.Series(check_patterns(note_text, neg_patterns, pos_patterns, history_patterns)))
result_df1 = pd.concat([df1[['BDSPPatientID', 'ContactDate', 'NoteFileName', 'Site', 'CT', 'MRI']], feature_matrix], axis=1)
#Need to add MRI and CT here 

# Save result to CSV file
result_df1.to_csv('feature_matrix_BIDMC_.csv', index=False)




[nltk_data] Downloading package punkt to /home/gregory178/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
#Print columns. 
print(result_df.columns)
print(result_df1.columns)
print(result_df.columns==result_df1.columns)

Index(['BDSPPatientID', 'ContactDate', 'NoteFileName', 'Site', 'CT', 'MRI',
       'acut sdh_pos', 'brain injuri_pos', 'brain mri_pos', 'burr hole_pos',
       'stabl sdh_pos', 'craniectomi_pos', 'craniotomi_pos', 'ct head_pos',
       'drainag_pos', 'evacu_pos', 'head ct_pos', 'hematoma_pos',
       'chronic sdh_pos', 'herniat_pos', 'intracrani hemorrhag_pos',
       'intraparenchym hemorrhag_pos', 'midlin shift_pos', 'mva_pos',
       'mvc_pos', 'neurosurgeri_pos', 'neurosurg intervent_pos', 'scan_pos',
       'sdh_pos', 'subdur_pos', 'tbi_pos', 'tentorium_pos', 'thick_pos',
       'trauma_pos', 'prior sdh_pos', 'recent sdh_pos', 'resolv sdh_pos',
       'known sdh_pos', 'acut sdh_neg', 'brain injuri_neg', 'brain mri_neg',
       'burr hole_neg', 'stabl sdh_neg', 'craniectomi_neg', 'craniotomi_neg',
       'ct head_neg', 'drainag_neg', 'evacu_neg', 'head ct_neg',
       'hematoma_neg', 'chronic sdh_neg', 'herniat_neg',
       'intracrani hemorrhag_neg', 'intraparenchym hemorrhag_neg'

In [24]:
# Combine the MGB and BIDMC feature matrixes with CPT codes. 
MGB_BIDMC_CPT_Feature_Matrix= pd.concat([result_df,result_df1], axis=0, ignore_index=True)
MGB_BIDMC_CPT_Feature_Matrix.to_csv("MGB_BIDMC_CPT_Feature_Matrix_.csv", index=False)
MGB_BIDMC_CPT_Feature_Matrix.head()

Unnamed: 0,BDSPPatientID,ContactDate,NoteFileName,Site,CT,MRI,acut sdh_pos,brain injuri_pos,brain mri_pos,burr hole_pos,...,subdur_neg,tbi_neg,tentorium_neg,thick_neg,trauma_neg,prior sdh_neg,recent sdh_neg,resolv sdh_neg,known sdh_neg,history_sdh
0,120109726,2019-12-01,Notes_13393227243_2508134861_20191201.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,111971091,2021-05-30,Notes_13517098931_6064385669_20210530.txt,MGB,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,114651683,2022-03-09,Notes_13621620103_9363901699_20220309.txt,MGB,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,115288640,2021-03-02,Notes_13554067774_6841967671_20210302.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,115389340,2019-12-02,Notes_13481394171_4791230696_20191202.txt,MGB,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
