In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np

csv_path = "/Users/mayasachidanand/Downloads/cs4120/project/mimic-iv-ext-bhc-labeled-clinical-notes-dataset-for-hospital-course-summarization-1.2.0/mimic-iv-bhc.csv"

# creating a 10% sample of the dataset due to size and memory constraints
total_rows = sum(1 for _ in open(csv_path)) - 1
print("Total Rows:", total_rows)
sample_frac = 0.1  # 10%
sample_n = int(total_rows * sample_frac)

# randomly skip rows
skip_idx = sorted(np.random.choice(np.arange(1, total_rows + 1), total_rows - sample_n, replace=False))
df = pd.read_csv(csv_path, skiprows=skip_idx)

print(df.info())
print(df.head())


Total Rows: 270033
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27003 entries, 0 to 27002
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   note_id        27003 non-null  object
 1   input          27003 non-null  object
 2   target         27003 non-null  object
 3   input_tokens   27003 non-null  int64 
 4   target_tokens  27003 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.0+ MB
None
          note_id                                              input  \
0  10001401-DS-21  <SEX> F <SERVICE> MEDICINE <ALLERGIES> No Know...   
1  10001884-DS-29  <SEX> F <SERVICE> MEDICINE <ALLERGIES> IV Dye,...   
2   10002013-DS-4  <SEX> F <SERVICE> PODIATRY <ALLERGIES> Patient...   
3  10002443-DS-15  <SEX> M <SERVICE> MEDICINE <ALLERGIES> No Alle...   
4  10002930-DS-12  <SEX> F <SERVICE> MEDICINE <ALLERGIES> No Know...   

                                              target  input_tokens  \
0  Ms. ___ is 

In [24]:
# remove unnecessary characters 
def clean_text(text):
    text = text.replace('\n', ' ').replace('\r', ' ') # remove newline characters
    text = re.sub(r'\[\*\*.*?\*\*\]', '', text) # remove PHI markers
    text = re.sub(r'[-=]{2,}', ' ', text) # remove repeated punctuation of -, =, maintain _ for deidentified information
    text = re.sub(r'\s+', ' ', text) # remove extra spaces
    return text

# testing cleaning of data
example_row = df.iloc[1]
cleaned_input = clean_text(example_row["input"])
cleaned_output = clean_text(example_row["target"])
print("Original Input:", example_row["input"])
print("Cleaned Input:", cleaned_input)

# clean texts based on above criteria
df["body"] = df["input"].apply(clean_text)
df["summary"] = df["target"].apply(clean_text)

Original Input: <SEX> F <SERVICE> MEDICINE <ALLERGIES> IV Dye, Iodine Containing Contrast Media / Oxycodone / cilostazol / Varenicline <ATTENDING> ___. <CHIEF COMPLAINT> Dyspnea <MAJOR SURGICAL OR INVASIVE PROCEDURE> None. <HISTORY OF PRESENT ILLNESS> ___ y/o F with asthma, CAD s/p stents (reported by patient), COPD, PAD, HTN, who presents with shortness of breath. The patient was sitting at home ewhen she suddenly felt short of breath. She drank some water and took nebulizers which she felt helped some. She also has noticed hoarseness of her voice. She presented because her symptoms have gotten worse throughout the day. Denies fever/chills, sore throat. Says she does have a cough from asthma and noticed increased wheezing. She had an episode of substernal chest pain this morning that lasted ___ minutes that was nonextertional. No radiation to jaw, arm, or back. It resolved without any intervention. She reported some leg swelling but says that has resolved. She denies PND, orthopnea. I

In [26]:
# calculate the mean length and shortest length text
def text_lengths(text):
    mean = text.str.len().mean()
    shortest = text.str.len().min()
    return mean, shortest

# explore the lengths of the texts for the body and summary columns
mean_body, shortest_body = text_lengths(df["body"])
print("Mean Body Length:", mean_body)
print("Shortest Body Length:", shortest_body)

mean_summary, shortest_summary = text_lengths(df["summary"])
print("Mean Summary Length:", mean_summary)
print("Shortest Summary Length:", shortest_summary)

# filter out bad note/summary examples (notes less than 1300 characters & summaries less than 300 characters)
df = df[(df["body"].str.len() > 1300) & (df["summary"].str.len() > 300)]
df_ready = df[["body", "summary"]].dropna()

Mean Body Length: 7637.248062593145
Shortest Body Length: 1231
Mean Summary Length: 2418.4498137108794
Shortest Summary Length: 253


In [27]:
# create train, test, validation splits based on the cleaned dataset
train, test = train_test_split(df_ready, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

print(len(train), len(val), len(test))

21681 2410 2677
