In [24]:
import pandas as pd
import sys
import os
# Note categories retained for the analysis.
selected_categories = ["Nursing", "Nursing/other"]

# Put the directory above the current working directory on the import path
# (presumably where the project's `utils` package lives).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

import importlib
import utils.event_extractor  

# Re-execute the already-imported module so the `from ... import EventExtractor`
# that follows binds the freshly loaded class (presumably to pick up edits
# without restarting the kernel).
importlib.reload(utils.event_extractor)
from utils.event_extractor import EventExtractor  

import pandas as pd
import numpy as np

def get_quartiles(df, column):
    """
    Compute and print the quartiles of a column and return them with the Q1–Q2 subset.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        column (str): The column name for which to calculate quartiles.

    Returns:
        tuple: (q1, q2, q3, mid_df)
            q1 (float): 25th percentile value.
            q2 (float): Median (50th percentile) value.
            q3 (float): 75th percentile value.
            mid_df (pd.DataFrame): Rows of df whose column value lies between
                q1 and q2 (both bounds inclusive).

    Side effects:
        Prints the three quartiles and the number of rows in mid_df.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q2 = values.median()
    q3 = values.quantile(0.75)

    # Second quarter of the distribution: Q1 <= value <= Q2 (inclusive on both ends).
    mid_df = df[values.between(q1, q2)]
    num = len(mid_df)

    print(f"{column} Quartiles :")
    print(f"Q1 (25th percentile): {q1:.2f}")
    print(f"Q2 (Median):          {q2:.2f}")
    print(f"Q3 (75th percentile): {q3:.2f}")
    # The counted subset is Q1..Q2, so the label must say q1-q2 (it used to say q1-q3).
    print(f"Number of rows in this range q1-q2: {num}")
    return q1, q2, q3, mid_df


In [25]:


# Read the admissions table from its pickled DataFrame.
admissions = pd.read_pickle("../data/ADMISSIONS.pkl")

# Length of stay in fractional days: elapsed seconds divided by seconds per day.
stay_duration = admissions["DISCHTIME"] - admissions["ADMITTIME"]
admissions["LOS_DAYS"] = stay_duration.dt.total_seconds() / (24 * 3600)

# stay_q1,stay_q2,stay_q3 = get_quartiles(admissions, "LOS_DAYS")





Valid, alive patients are those who have a valid DOB and no DOD associated with them.



In [26]:

# Read the patient table and the clinical notes table.
patients = pd.read_pickle("../data/PATIENTS.pkl")
notes = pd.read_pickle("../data/NOTEEVENTS.pkl")

# Missing HADM_ID values become 0 so the column can be cast to int.
hadm_ids = notes['HADM_ID'].fillna(0)
notes['HADM_ID'] = hadm_ids.astype(int)

# Parse the birth/death columns as datetimes; values that cannot be parsed are coerced to NaT.
date_cols = ['DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN']
patients[date_cols] = patients[date_cols].apply(
    lambda col_values: pd.to_datetime(col_values, errors='coerce')
)

# A patient counts as alive when none of the three death-date columns is populated.
death_cols = ['DOD', 'DOD_HOSP', 'DOD_SSN']
no_death_recorded = patients[death_cols].isna().all(axis=1)
alive_patients = patients[no_death_recorded]

print(f"Number of patients not known to have died: {len(alive_patients)}")
print(alive_patients.head())

# No additional DOB filtering is applied at this point; the alias is what later cells use.
valid_alive_patients = alive_patients
valid_alive_patients['DOB'].min()


Number of patients not known to have died: 30761
   ROW_ID  SUBJECT_ID GENDER        DOB DOD DOD_HOSP DOD_SSN  EXPIRE_FLAG
0     234         249      F 2075-03-13 NaT      NaT     NaT            0
2     236         251      M 2090-03-15 NaT      NaT     NaT            0
3     237         252      M 2078-03-06 NaT      NaT     NaT            0
4     238         253      F 2089-11-26 NaT      NaT     NaT            0
5     239         255      M 2109-08-05 NaT      NaT     NaT            0


Timestamp('1800-07-16 00:00:00')

In [27]:
from datetime import datetime
import numpy as np

def calculate_age(row):
    """
    Return the patient's age in years at admission for one admissions row.

    Parameters:
        row: Mapping-like object (e.g. a DataFrame row) with 'ADMITTIME' and
            'DOB' entries that are pandas Timestamps, or missing values.

    Returns:
        float: Whole days between DOB and ADMITTIME divided by 365.25, or NaN
            when either value is missing (NaN, None or NaT).
    """
    admit, dob = row['ADMITTIME'], row['DOB']

    # A missing value has no usable .to_pydatetime(); report an unknown age instead of raising.
    if pd.isna(admit) or pd.isna(dob):
        return float("nan")

    # Subtract as plain Python datetimes: pandas Timedelta is limited to roughly
    # +/-292 years, and the data contains birth dates centuries before admission
    # (presumably why the conversion is used here).
    return (admit.to_pydatetime() - dob.to_pydatetime()).days / 365.25

# Lookup: SUBJECT_ID -> date of birth, for patients with no recorded death.
subject_id_to_dob = dict(zip(valid_alive_patients["SUBJECT_ID"], valid_alive_patients["DOB"]))

# Number of notes (non-null TEXT) in the selected categories, per admission.
grouped_notes = (
    notes[notes["CATEGORY"].isin(selected_categories)]
    .groupby("HADM_ID")["TEXT"]
    .count()
    .reset_index()
    .rename(columns={"TEXT": "COUNT_TEXT"})
)
# Lookup: HADM_ID -> number of selected-category notes.
hadm_to_num_report = dict(zip(grouped_notes["HADM_ID"], grouped_notes["COUNT_TEXT"]))

# Series.map with a dict leaves keys that are absent from the dict as missing values.
admissions["DOB"] = admissions["SUBJECT_ID"].map(subject_id_to_dob)
admissions["AGE"] = admissions.apply(calculate_age, axis=1)
admissions["NUM_REPORTS"] = admissions["HADM_ID"].map(hadm_to_num_report)

# Keep admissions with an age below 200 years (rows with an unknown age fail the
# comparison and are dropped too) and with both a DOB and a note count.
# Chaining the filter with a non-inplace dropna avoids mutating a slice in place,
# which is what triggers pandas' SettingWithCopyWarning.
admissions = admissions[admissions["AGE"] < 200].dropna(subset=["DOB", "NUM_REPORTS"])



In [28]:
# One (q1, q2, q3, subset) tuple per column: length of stay, note count, then age.
los_quartiles, n_report_quartiles, age_quartiles = [
    get_quartiles(admissions, col_name)
    for col_name in ("LOS_DAYS", "NUM_REPORTS", "AGE")
]


LOS_DAYS Quartiles :
Q1 (25th percentile): 3.39
Q2 (Median):          5.81
Q3 (75th percentile): 10.72
Number of rows in this range q1-q3: 6430
NUM_REPORTS Quartiles :
Q1 (25th percentile): 3.00
Q2 (Median):          7.00
Q3 (75th percentile): 18.00
Number of rows in this range q1-q3: 9068
AGE Quartiles :
Q1 (25th percentile): 0.00
Q2 (Median):          48.07
Q3 (75th percentile): 64.72
Number of rows in this range q1-q3: 12856


In [29]:
# HADM_IDs found in the returned subset (last tuple element) of each quartile result.
hadm_id_sets = [
    set(result[-1]["HADM_ID"].tolist())
    for result in (los_quartiles, n_report_quartiles, age_quartiles)
]
# Admissions that fall inside the selected range for all three variables.
common_hadm = set.intersection(*hadm_id_sets)
len(common_hadm)

1332

In [None]:
import os
import spacy

# Put the directory above the current working directory on the import path
# (`sys` is imported in the notebook's first cell, not in this one).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)
    
import utils.nlp_tools as nlp_tools
# Re-execute the project module and rebind the name to the module object that
# reload returns (`importlib` is imported in the notebook's first cell).
nlp_tools = importlib.reload(nlp_tools)
# Text helper used by extract_sentences below. "en_core_web_lg" is presumably the
# spaCy model name handed to TextLib — TODO confirm against utils/nlp_tools.
nlp = nlp_tools.TextLib("en_core_web_lg")

from resources.abbreviations import abbreviation_dict

# Load the English model (download if you haven't: python -m spacy download en_core_web_lg)


# Cache of processed notes: raw note text -> list of sentence strings.
known_dict = {}
def extract_sentences(text):
    """
    Split a note into sentences, caching the result per raw input text.

    The text is passed through nlp.replace_abbreviations (with abbreviation_dict)
    and nlp.remove_error_strings, then through nlp.sentence_splitter; the 'text'
    entry of every returned item is collected.

    Parameters:
        text (str): Raw note text.

    Returns:
        list: Sentence strings for the note. Calls with the same raw text return
            the same cached list object, so callers should not mutate it.
    """
    if text in known_dict:
        return known_dict[text]

    # Keep `text` untouched: it is the cache key. Storing the result under the
    # cleaned text (as was done before) means the lookup above never hits.
    cleaned = nlp.replace_abbreviations(text, abbreviation_dict)
    cleaned = nlp.remove_error_strings(cleaned)
    sentences = [sent['text'] for sent in nlp.sentence_splitter(cleaned, span=False)]

    known_dict[text] = sentences
    return sentences

# Notes in the selected categories that belong to the admissions in common_hadm.
report_mask = notes["HADM_ID"].isin(common_hadm) & notes["CATEGORY"].isin(selected_categories)
# .copy() detaches the subset from `notes`, so adding a column below neither
# raises SettingWithCopyWarning nor depends on view/copy semantics.
filtered_reports_df = notes[report_mask].copy()

# Show the sentence split of the first note as a quick sanity check.
print(extract_sentences(filtered_reports_df['TEXT'].iloc[0]))
filtered_reports_df['Sentences'] = filtered_reports_df['TEXT'].apply(extract_sentences)


['22 yr old with type 1 iddm  since age 6. patient admitted with n/v and glu\n   345, DKA.', 'Tx in ew with ns , zolfran, morphine and iv insulin.', 'Hx\n   depression, high chol, gastritis with dka admit on [**2-27**].', '', ' iv insulin stopped 10 am with sc glargine at 0930,  fs on iv in\n   80\ns  now 130-160, no nausea , vomited at 0800 small amt approx 20 cc.\n   tolerating fluids and jello', ' increased coverage and extra dose glargine and humalog at 1630.', 'labs drawn at a630, iv fluids off at 1500', ' patient requesting solid supper and no n/v see 1630 labs', ' cont qid fs and coverage with SSI, give antiemetics.', 'Next labs\n   [**2132**]', '', ' abd pain', '[**7-9**] and patient sleeping most of day after iv morphine\n   administered.', 'patient was on iv morphine at last admit and then oxycodone\n   psot hospitalizartion.', 'patient stated that he did not have any more\n   oxycodone at home .', 'requesting pain med q 4-5 hr.', 'hr 95-105 when c/o\n   pain.', ' iv morphine

In [19]:
import os
# Create the export directory if needed; exist_ok=True makes this a no-op when it already exists.
os.makedirs("../exports", exist_ok=True)
# Persist filtered_reports_df as a pickled DataFrame for later use.
filtered_reports_df.to_pickle("../exports/filtered_patient_reports.pkl")

['22 yr old with type 1 iddm  since age 6. patient admitted with n/v and glu\n   345, DKA.',
 'Tx in ew with ns , zolfran, morphine and iv insulin.',
 'Hx\n   depression, high chol, gastritis with dka admit on [**2-27**].',
 '',
 ' iv insulin stopped 10 am with sc glargine at 0930,  fs on iv in\n   80\ns  now 130-160, no nausea , vomited at 0800 small amt approx 20 cc.\n   tolerating fluids and jello',
 ' increased coverage and extra dose glargine and humalog at 1630.',
 'labs drawn at a630, iv fluids off at 1500',
 ' patient requesting solid supper and no n/v see 1630 labs',
 ' cont qid fs and coverage with SSI, give antiemetics.',
 'Next labs\n   [**2132**]',
 '',
 ' abd pain',
 '[**7-9**] and patient sleeping most of day after iv morphine\n   administered.',
 'patient was on iv morphine at last admit and then oxycodone\n   psot hospitalizartion.',
 'patient stated that he did not have any more\n   oxycodone at home .',
 'requesting pain med q 4-5 hr.',
 'hr 95-105 when c/o\n   pain.