In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np
from datasets import Dataset, DatasetDict

# NOTE(review): absolute, machine-specific path — consider moving it to a configurable data directory.
csv_path = "/Users/mayasachidanand/Downloads/cs4120/project/mimic-iv-ext-bhc-labeled-clinical-notes-dataset-for-hospital-course-summarization-1.2.0/mimic-iv-bhc.csv"

# Fixed seed so that a fresh kernel draws the same 10% sample on every run.
SAMPLE_SEED = 42

# creating a 10% sample of the dataset due to size and memory constraints
# Count physical lines (minus the header line) inside a context manager so the
# file handle is closed afterwards. This assumes one CSV record per physical
# line — TODO confirm no quoted field contains an embedded newline.
with open(csv_path) as csv_file:
    total_rows = sum(1 for _ in csv_file) - 1
print("Total Rows:", total_rows)
sample_frac = 0.1  # 10%
sample_n = int(total_rows * sample_frac)

# randomly skip rows: draw (total_rows - sample_n) distinct line numbers from
# 1..total_rows to drop; line 0 (the header) is never a candidate, so the
# column names are always kept.
rng = np.random.default_rng(SAMPLE_SEED)
skip_idx = np.sort(
    rng.choice(np.arange(1, total_rows + 1), size=total_rows - sample_n, replace=False)
).tolist()
df = pd.read_csv(csv_path, skiprows=skip_idx)

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df.info()
print(df.head())

Total Rows: 270033
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27003 entries, 0 to 27002
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   note_id        27003 non-null  object
 1   input          27003 non-null  object
 2   target         27003 non-null  object
 3   input_tokens   27003 non-null  int64 
 4   target_tokens  27003 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.0+ MB
None
          note_id  ... target_tokens
0  10000248-DS-10  ...           230
1  10000980-DS-21  ...           651
2  10001401-DS-19  ...           698
3  10001884-DS-33  ...           232
4   10002155-DS-9  ...           428

[5 rows x 5 columns]


In [12]:
# remove unnecessary characters
def clean_text(text):
    """Normalize one clinical note or summary string for modelling.

    Steps, applied in order:
      1. newlines / carriage returns become spaces (done first so the PHI
         pattern below, whose ``.`` does not match a newline, can still
         match a marker that was wrapped across lines);
      2. ``[** ... **]`` PHI markers are removed;
      3. runs of two or more ``-`` / ``=`` characters become a single space
         (``_`` runs are kept because they stand for de-identified values);
      4. every run of whitespace is collapsed to one space.

    Args:
        text: the raw text. Non-string values (e.g. ``None`` or a NaN from a
            missing CSV cell) are returned unchanged instead of raising, so a
            later ``dropna()`` can discard them.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        # Missing values have no string methods; pass them through untouched.
        return text
    text = text.replace('\n', ' ').replace('\r', ' ') # remove newline characters
    text = re.sub(r'\[\*\*.*?\*\*\]', '', text) # remove PHI markers
    text = re.sub(r'[-=]{2,}', ' ', text) # remove repeated punctuation of -, =, maintain _ for deidentified information
    text = re.sub(r'\s+', ' ', text) # remove extra spaces
    return text

# Spot-check the cleaner on a single record (second row of the sample):
# clean its note and its summary, then show the note before and after.
example_row = df.iloc[1]
cleaned_input, cleaned_output = (
    clean_text(example_row[column]) for column in ("input", "target")
)
print("Original Input:", example_row["input"])
print("Cleaned Input:", cleaned_input)

# Apply the same cleaning to every row: "input" -> "body", "target" -> "summary".
for source_col, cleaned_col in (("input", "body"), ("target", "summary")):
    df[cleaned_col] = df[source_col].apply(clean_text)

Original Input: <SEX> F <SERVICE> MEDICINE <ALLERGIES> No Known Allergies / Adverse Drug Reactions <ATTENDING> ___. <CHIEF COMPLAINT> dyspnea <MAJOR SURGICAL OR INVASIVE PROCEDURE> Cardiac catheterization ___ <HISTORY OF PRESENT ILLNESS> This is a ___ M with history of diabetes, diastolic CHF, hypertension, ?CAD, peripheral vascular disease, CKD presenting with ___ days of increasing dyspnea and non-productive cough. She denies fevers, chills, chest pain, nausea, vomiting. Did report feeling somewhat wheezy. Denies leg swelling, has possibly had a 2 lb weight gain. Denies missed medication doses. Does report 2 pillow orthopnea last night. Her husband has also been sick with a cough for the past day or so. She is a non-smoker. She lives at home and has had no recent hospitalizations or courses of antibiotics. In the ED, initial vitals: 97.8 80 132/83 25 97% ra. CXR showed probable RUL PNA. Normal WBC and lactate, Cr at baseline. Troponin 0.09 with normal CK-MB. BNP 2826. She was started

In [13]:
# character-length statistics for a text column
def text_lengths(text):
    """Return ``(mean, shortest)`` character length of a pandas string Series.

    Args:
        text: a pandas Series of strings (accessed through ``.str``).

    Returns:
        A 2-tuple: the mean of the per-element lengths and the minimum
        per-element length.
    """
    char_counts = text.str.len()
    return char_counts.mean(), char_counts.min()

# Length statistics for the cleaned note ("body") and summary columns,
# computed on the data before any filtering.
mean_body, shortest_body = text_lengths(df["body"])
mean_summary, shortest_summary = text_lengths(df["summary"])

print("Mean Body Length:", mean_body)
print("Shortest Body Length:", shortest_body)
print("Mean Summary Length:", mean_summary)
print("Shortest Summary Length:", shortest_summary)

# Drop low-quality pairs: keep only rows whose note is longer than 1300
# characters AND whose summary is longer than 300 characters (strictly greater).
MIN_BODY_CHARS = 1300
MIN_SUMMARY_CHARS = 300
body_long_enough = df["body"].str.len() > MIN_BODY_CHARS
summary_long_enough = df["summary"].str.len() > MIN_SUMMARY_CHARS
df = df[body_long_enough & summary_long_enough]

# Keep just the two modelling columns and discard rows with a missing value in either.
df_ready = df[["body", "summary"]].dropna()

Mean Body Length: 7629.290930637337
Shortest Body Length: 293
Mean Summary Length: 2423.122060511795
Shortest Summary Length: 11


In [14]:
# Two-stage split of the cleaned dataset with a fixed random_state:
# first hold out 10% as the test set, then hold out 10% of the remainder
# as the validation set; what is left is the training set.
train_val, test = train_test_split(df_ready, test_size=0.1, random_state=42)
train, val = train_test_split(train_val, test_size=0.1, random_state=42)

print(*(len(split) for split in (train, val, test)))

21700 2412 2680


In [15]:
# Wrap each split in a HuggingFace Dataset (index reset and dropped first)
# and collect them in a DatasetDict keyed by split name.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (("train", train), ("validation", val), ("test", test))
})

# Per-split handles, kept under their own names.
train_ds = dataset["train"]
val_ds = dataset["validation"]
test_ds = dataset["test"]

# Persist all three splits under the "bhc_cleaned_dataset" directory.
dataset.save_to_disk("bhc_cleaned_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 21700/21700 [00:00<00:00, 451875.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2412/2412 [00:00<00:00, 320269.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2680/2680 [00:00<00:00, 207202.48 examples/s]
