In [1]:
import pandas as pd
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import re
import numpy as np
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from multiprocessing import Pool
import random
from os import listdir
import pandas as pd
import time
from pathlib import Path
import re
from tqdm import tqdm
import openai
import time
# Configure the openai module with the literal placeholder key "EMPTY" and an
# API base URL on localhost port 8000.
# NOTE(review): presumably a locally hosted OpenAI-compatible server used by
# the summarization helpers — confirm it is running before the generation cells.
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

In [3]:
import sys, os
# Insert the parent of sys.path[0] at position 1 of the module search path,
# presumably so that the `utils` and `script` packages below can be imported.
sys.path.insert(1, os.path.join(sys.path[0], '../'))
# NOTE(review): these star imports provide names used later in this notebook
# that have no visible definition here (e.g. input_sections, parse_sections,
# target_section_summarization, fix_hallucination); which module supplies
# which name cannot be told from this file.
from utils.prompting import *
from script.section_extraction import *
from script.radiology_report_selection import *
from script.target_section_generation import *

# Load Dataset

In [4]:
def merge_two_dfs_no_dup_cols(df1, df2, merge_col):
    """Merge ``df2`` into ``df1`` without duplicating columns they share.

    Only the columns of ``df2`` that do not already exist in ``df1`` are
    carried over (plus the merge keys), so the result never contains
    ``_x`` / ``_y`` suffixed duplicates; for shared columns the values of
    ``df1`` are kept.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Left and right frames.
    merge_col : str or list of str
        Key column name(s) present in both frames. A single name may be
        passed as a plain string.

    Returns
    -------
    pandas.DataFrame
        Result of ``df1.merge(...)`` on ``merge_col`` (pandas' default join).
    """
    # Accept a bare column name as well as a list: the original `list + str`
    # concatenation raised TypeError for a string key.
    merge_keys = [merge_col] if isinstance(merge_col, str) else list(merge_col)
    cols_to_use = df2.columns.difference(df1.columns).tolist() + merge_keys
    return df1.merge(df2[cols_to_use], on=merge_keys)

In [5]:
# Name of the dataset to process; it is interpolated into the ../data/ paths
# used by later cells. Alternative value kept for reference:
# dataset = "sample"
dataset = "test_phase_2"

In [6]:
# Directory (relative to the notebook) from which the input CSVs are read.
data_path = f"../data/{dataset}/"

In [7]:
dfs = {}

# Read both tables with keep_default_na=False so empty fields stay as empty
# strings instead of becoming NaN.
df_discharge = pd.read_csv(os.path.join(data_path, "discharge.csv.gz"), keep_default_na=False)
df_radiology = pd.read_csv(os.path.join(data_path, "radiology.csv.gz"), keep_default_na=False)

# Join radiology rows onto discharge rows by admission id; the radiology
# `text` column is renamed first so it is carried over as `radiology_text`.
merged_df = merge_two_dfs_no_dup_cols(
    df_discharge,
    df_radiology.rename(columns={'text': 'radiology_text'}),
    ['hadm_id'],
)

# Group on every column except `radiology_text`, collecting the radiology
# texts of each group into a list.
merge_col = [name for name in merged_df.columns if name != 'radiology_text']
merged_df = (
    merged_df
    .groupby(merge_col)
    .agg({'radiology_text': lambda reports: reports.tolist()})
    .reset_index()
)
df = merged_df

# Section Extraction (Parsing)

In [8]:
# SECTIONS REPORT
# For each expected input section header, record the fraction of notes whose
# raw `text` contains that header. The result is a list of one-element Series
# (named after the section, with a single 'dist' entry).
size = df.shape[0]  # number of notes; does not change inside the loop
s_list = []
for section in input_sections.keys():
    # regex=False: the header is matched as a literal substring, so any
    # regex metacharacter in a section name cannot change the match.
    filtered_size = int(df['text'].str.contains(section, regex=False).sum())
    # Build the Series with its value directly; an empty pd.Series() without
    # a dtype is deprecated in older pandas versions.
    s = pd.Series({'dist': filtered_size / size}, name=section)
    s_list.append(s)

In [9]:
# Show the coverage report as a table: one row per section name, one column 'dist'.
pd.DataFrame(s_list)

Unnamed: 0,dist
Allergies,1.0
Chief Complaint,0.96798
Major Surgical or Invasive Procedure,0.997811
History of Present Illness,0.977468
Past Medical History,0.978106
Social History,0.971538
Family History,0.966794
Physical Exam,0.954753
Pertinent Results,0.979748
Brief Hospital Course,1.0


## Parse

In [10]:
# Run parse_sections on every row (axis=1) in parallel via pandarallel and
# rebind df to the result.
# NOTE(review): parse_sections is defined outside this notebook; the following
# cells assume it adds one column per section (spaces replaced by underscores).
df = df.parallel_apply(parse_sections, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1097), Label(value='0 / 1097'))), …

## Post-processing

In [11]:
# Column names of the input sections: each section title with its spaces
# replaced by underscores.
sections_col = [title.replace(" ", "_") for title in input_sections.keys()]

In [12]:
# For every section column, replace each non-null cell by its element at
# index 1.
# NOTE(review): presumably the parser stores a (header, body) pair per cell
# and only the body is kept — confirm against parse_sections.
for section_name in sections_col:
    present = df[section_name].notna()
    df.loc[present, section_name] = df.loc[present, section_name].apply(lambda parsed: parsed[1])

Remove Target from the Input Text

In [13]:
# Pass every row through remove_output_from_input, then rename the `new_text`
# column to `processed_text`.
# NOTE(review): presumably the helper writes `new_text` as the note text with
# the target sections removed — confirm in its definition (not in this file).
df = df.apply(remove_output_from_input, axis=1)
df = df.rename(columns={'new_text': 'processed_text'})

Calculate Number of Words

In [14]:
# Pass every row through calculate_word_count.
# NOTE(review): presumably this adds the *_Word_Count columns visible in the
# frame displayed further down — confirm in the helper (not in this file).
df = df.apply(calculate_word_count, axis=1)

# Radiology Report Selection

In [15]:
# Pass every row through replace_pertinent_results_with_radiology.
# NOTE(review): judging by its name, this swaps the Pertinent Results content
# for selected radiology reports; the selection rule lives in the helper and is
# not visible here.
df = df.apply(replace_pertinent_results_with_radiology, axis=1)

In [16]:
# Display the working frame after parsing and radiology selection (pandas
# truncates the rendered rows and columns).
df

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,radiology_text,Allergies,...,Discharge_Medications,Discharge_Disposition,Discharge_Diagnosis,Discharge_Condition,Discharge_Instructions,processed_text,Brief_Hospital_Course_Word_Count,Discharge_Instructions_Word_Count,Physical_Exam_Word_Count,Pertinent_Results_Word_Count
0,10001884-DS-35,10001884,24962904,DS,35,2130-12-08 00:00:00,2130-12-13 21:50:00,\nName: ___ Unit No: ___\n \nA...,[EXAMINATION:\nChest: Frontal and lateral vie...,"IV Dye, Iodine Containing Contrast Media / Oxy...",...,1. Acetaminophen 325 mg PO Q4H:PRN Pain \n2. a...,Extended Care,PRIMARY:\nCOPD Exacerbation\n\nSECONDARY:\nAfi...,Mental Status: Clear and coherent.,"Dear Ms. ___,\n\nYou were admitted to ___ afte...",\nName: ___ Unit No: ___\n \nA...,444,87,139,104
1,10003019-DS-22,10003019,22774359,DS,22,2175-10-17 00:00:00,2175-10-24 14:40:00,\nName: ___. Unit No: ___\n \...,"[EXAM: Chest, frontal and lateral views.\n\nC...",Ragweed / morphine / Percocet,...,1. Acyclovir 400 mg PO Q8H \n2. Fluconazole 40...,Home With Service,"Primary Diagnosis\nNeutropenic Fever, no sourc...",Mental Status: Clear and coherent.,"Dear Mr. ___,\n\nIt has been our pleasure to b...",\nName: ___. Unit No: ___\n \...,698,140,212,1212
2,10003299-DS-7,10003299,29323205,DS,7,2181-10-23 00:00:00,2181-10-24 07:08:00,\nName: ___ Unit No: ___\n...,[EXAMINATION: CT HEAD W/O CONTRAST Q111 CT HE...,Iodine-Iodine Containing,...,1. Aspirin 81 mg PO DAILY \n2. Atorvastatin...,Home With Service,Ischemic stroke,Mental Status: Clear and coherent.,Dear ___ were hospitalized due to symptoms of ...,\nName: ___ Unit No: ___\n...,386,242,734,329
3,10003502-DS-7,10003502,20459702,DS,7,2166-02-19 00:00:00,2166-02-20 09:32:00,\nName: ___ Unit No: ___\n...,[HISTORY: Nausea and chllls.\n\nTECHNIQUE: F...,nifedipine / Amitriptyline / Prilosec OTC / Te...,...,1. Acetaminophen 500 mg PO Q6H:PRN pain \n2. A...,Home With Service,Primary: \nAcute diastolic CHF exacerbation\nN...,Mental Status: Clear and coherent.,It was a pleasure caring for you at ___ \n___....,\nName: ___ Unit No: ___\n...,565,74,90,296
4,10004322-DS-21,10004322,28755331,DS,21,2131-01-26 00:00:00,2131-01-26 18:32:00,\nName: ___ Unit No: ___\n...,[CHEST RADIOGRAPH PERFORMED ON ___\n\nCOMPARIS...,No Known Allergies / Adverse Drug Reactions,...,1. Acetaminophen 650 mg PO Q6H:PRN pain \n2. A...,Home With Service,Primary: mechanical fall,Mental Status: Confused - sometimes.,"Dear Mr. ___,\n\nIt was our pleasure to care f...",\nName: ___ Unit No: ___\n...,300,69,185,270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10957,19995366-DS-16,19995366,23598426,DS,16,2148-05-18 00:00:00,2148-05-18 14:33:00,\nName: ___ Unit No: ___...,[INDICATION: Isolated right sixth nerve palsy...,No Known Allergies / Adverse Drug Reactions,...,1. Simvastatin 20 mg PO DAILY,Home,Abducens nerve palsy.,Mental Status: Clear and coherent.,You came to the hospital with double vision (d...,\nName: ___ Unit No: ___...,150,109,469,70
10958,19997367-DS-24,19997367,29933340,DS,24,2128-01-21 00:00:00,2128-01-22 21:41:00,\nName: ___ Unit No: ___\...,[EXAMINATION: CHEST (PA AND LAT)\n\nINDICATIO...,Cipro Cystitis / Dilaudid / Mexiletine / Bactr...,...,1. FoLIC Acid 1 mg PO DAILY \n2. Rifaximin 550...,Home With Service,PRIMARY\nHepatic Encephalopathy\nUTI\nPulmonar...,Mental Status: Clear and coherent.,"Dear Ms. ___,\n\nIt was a pleasure being a par...",\nName: ___ Unit No: ___\...,502,123,155,196
10959,19997367-DS-25,19997367,22314636,DS,25,2128-01-31 00:00:00,2128-02-01 17:02:00,\nName: ___ Unit No: ___\...,[EXAMINATION: CHEST (AP AND LATERAL)\n\nINDI...,Cipro Cystitis / Dilaudid / Mexiletine / Bactr...,...,1. Acyclovir 400 mg PO Q12H \n2. Aspirin 81 mg...,Home With Service,Primary diagnoses:\n- Acute on chronic diastol...,Mental Status: Clear and coherent.,"Dear ___,\n\n___ was a pleasure taking care of...",\nName: ___ Unit No: ___\...,348,125,124,452
10960,19997843-DS-13,19997843,20277361,DS,13,2120-11-28 00:00:00,2120-11-28 16:16:00,\nName: ___ Unit No: ___...,[EXAMINATION: CHEST (PORTABLE AP)\n\nINDICATI...,No Allergies/ADRs on File,...,,Home,PRIMARY DIAGNOSES\n# Complex Alcohol Withdrawa...,Mental Status: Confused - sometimes.,"___,\n\nFue un placer atenderlo en ___ Médico ...",\nName: ___ Unit No: ___...,722,159,160,374


# Target Section Summarization

## Brief Hospital Course Summarization

In [17]:
# Length of each processed input text, then order the frame longest-first.
# NOTE(review): split(" ") splits on single space characters only, so runs of
# spaces contribute empty strings to the count and newlines do not separate
# words. The length thresholds in the following cells are applied to this
# count, so changing the tokenization would change the category assignment.
df['processed_text_word_count'] = df['processed_text'].apply(lambda x: len(x.split(" ")))
df = df.sort_values(by=['processed_text_word_count'], ascending=False)

In [18]:
# Length category: 1 when the processed text has fewer than `thres` words,
# 0 otherwise.
thres = 1000
df['category'] = df['processed_text_word_count'].lt(thres).astype('int64')

In [19]:
# Notes whose length lies in [thres, 1300] (both ends inclusive) get their own
# category 2, overriding the 0 assigned by the threshold cell. The lower bound
# reuses `thres` so the two cells cannot drift apart if the threshold changes.
mask = df['processed_text_word_count'].between(thres, 1300)
df.loc[mask, 'category'] = 2

In [20]:
# Directory handed to target_section_summarization as its first argument.
# NOTE(review): presumably where per-category results are cached (the run log
# below prints "Loaded cached file") — confirm in the helper.
root_path = f'../data/{dataset}/brief_hospital_course_cache/'
# Number of worker processes in the multiprocessing Pool used in the next cell.
num_workers = 1

In [21]:
# One argument tuple per length category (0, 1, 2); each is unpacked by
# starmap into a call of target_section_summarization.
# NOTE(review): the meaning of the trailing 100 is defined by the helper and
# is not visible in this notebook.
inputs = []
for domain in [0, 1, 2]:
    domain_rows = df[df['category'] == domain].reset_index(drop=True)
    inputs.append((root_path, 'brief_hospital_course', domain, domain_rows, 100))

start_time = time.time()
with Pool(num_workers) as processor:
    # `data` is a list with one result per category, in the order of `inputs`.
    data = processor.starmap(target_section_summarization, inputs)

0 :  Loaded cached file. Done
1 :  Loaded cached file. Done
2 :  Loaded cached file. Done


In [22]:
# Stack the per-category results and keep a single row per hadm_id
# (drop_duplicates keeps the first occurrence by default), then show the shape.
processed_df = pd.concat(data).drop_duplicates(['hadm_id'])
processed_df.shape

(10962, 2)

In [23]:
# Attach the generated summaries to the working frame by hadm_id.
# pandas merges with how="inner" by default, so any row whose hadm_id is
# missing from processed_df is dropped here.
df = df.merge(processed_df, on=['hadm_id'])

# Post-processing

In [24]:
# Flag summaries containing a run of eight or more consecutive lines that each
# start with '#', and pass those rows through fix_hallucination.
# re.search stops at the first match instead of collecting every match the
# way len(re.findall(...)) > 0 did; the resulting mask is the same.
mask = df['brief_hospital_course'].apply(
    lambda x: re.search(r'(\#[^\n]+\n){8,}', x) is not None
)
# Guard: when nothing is flagged, df[mask] is empty and a row-wise apply on an
# empty frame may return an empty DataFrame rather than a Series, which is not
# what the single-column assignment below expects.
if mask.any():
    df.loc[mask, 'brief_hospital_course'] = df[mask].apply(fix_hallucination, axis=1)

In [25]:
# Text clean-up of the generated summaries, applied in this order:
#   1. remove_repitition (project helper, defined outside this notebook)
#   2. keep only the first three blocks separated by blank lines
#   3. drop a newline that sits between "<char><whitespace>" and an
#      alphanumeric character, joining the two lines
#   4. remove_sent_repitition (project helper, defined outside this notebook)
df['brief_hospital_course'] = (
    df['brief_hospital_course']
    .apply(remove_repitition)
    .apply(lambda summary: "\n\n".join(summary.split("\n\n")[:3]))
    .apply(lambda summary: re.sub(r'([A-Za-z0-9,._][ \s])\n([A-Za-z0-9])', r'\1\2', summary))
    .apply(remove_sent_repitition)
)

In [26]:
# Run post_process_brief_hospital_course row-wise (in parallel via
# pandarallel) on the rows that have a non-null summary and write the result
# back into the same column for those rows.
# NOTE(review): the helper is defined outside this notebook; what it changes
# in the text is not visible here.
mask = pd.notnull(df['brief_hospital_course'])
df.loc[mask, 'brief_hospital_course'] = df[mask].parallel_apply(post_process_brief_hospital_course, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1097), Label(value='0 / 1097'))), …

In [27]:
# Spot-check: print the final summary of the first row in the current row order.
print(df['brief_hospital_course'].iloc[0])

___ with metastatic breast cancer, s/p 6 weeks of steroids for pneumonia, now s/p 21 days of Bactrim for PCP ___.
ACUTE:

# PCP ___: 
Presented to ___ clinic ___ with cough, fever, chest tightness, CXR c/f bilateral pleural effusions and infiltrates, sent to ED where she had a CT chest suspicious for multifocal PNA, started on vanc/cefepime. ID and Pulmonary were consulted. Patient underwent bronch ___, BAL positive for Pneumocystis jirovecii, was switched to IV Bactrim ___, and PO prednisone taper was initiated to decrease inflammation. Transitioned to PO Bactrim ___ for total 21d course (last day ___. Became hypoxic requiring 5L NC but was satting well on RA by discharge. Final acid fast cultures pending on discharge.


In [28]:
# Write hadm_id and the final summary to brief_hospital_course.csv in the
# dataset directory, without the index column.
df[['hadm_id', 'brief_hospital_course']].to_csv(f"../data/{dataset}/brief_hospital_course.csv", index=False)