In [1]:
import pandas as pd
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import re
import numpy as np
from pandarallel import pandarallel
# Set up pandarallel so that DataFrame.parallel_apply can be used further down;
# progress_bar=True makes each parallel call render a progress widget.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from multiprocessing import Pool
import random
from os import listdir
import pandas as pd
import time
from pathlib import Path
import re
from tqdm import tqdm
import openai
import time
# Point the OpenAI client at an endpoint on localhost rather than the hosted API.
# "EMPTY" is a placeholder value, not a credential — presumably the local
# server does not validate the key (TODO confirm).
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

In [3]:
import sys, os
# Add the directory one level above sys.path[0] to the import path, presumably
# so that the project packages `utils` and `script` imported below resolve.
sys.path.insert(1, os.path.join(sys.path[0], '../'))
# Wildcard imports: helpers used later in this notebook but not defined in it
# (e.g. input_sections, parse_sections, target_section_summarization)
# presumably come from these modules — which module provides which name is not
# visible from here.
from utils.prompting import *
from script.section_extraction import *
from script.radiology_report_selection import *
from script.target_section_generation import *

# Load Dataset

In [4]:
# Name of the data split to process; it selects the sub-directory of ../data/
# that the later cells read from and write to.
# dataset = "sample"
dataset = "test_phase_2"

In [5]:
# Directory of the selected split; input files are read from it and the
# extracted-sections pickle is written to it.
data_path = f"../data/{dataset}/"

In [6]:
# Read the discharge notes and the radiology reports of the selected split.
# keep_default_na=False turns off pandas' default NaN markers while parsing.
dfs = {}
df_discharge = pd.read_csv(os.path.join(data_path, "discharge.csv.gz"), keep_default_na=False)
df_radiology = pd.read_csv(os.path.join(data_path, "radiology.csv.gz"), keep_default_na=False)

# Join radiology onto discharge by hadm_id; the radiology 'text' column is
# renamed first so it stays distinct from the discharge note's 'text'.
merged_df = merge_two_dfs_no_dup_cols(
    df_discharge,
    df_radiology.rename(columns={'text': 'radiology_text'}),
    ['hadm_id'],
)

# Group on every column except the radiology text and gather the radiology
# reports of each group into a single list.
merge_col = merged_df.columns.drop('radiology_text').tolist()
merged_df = (
    merged_df
    .groupby(merge_col)
    .agg({'radiology_text': lambda reports: reports.tolist()})
    .reset_index()
)
df = merged_df

# Section Extraction (Parsing)

In [7]:
# SECTIONS REPORT
# For every input section title, compute the fraction of notes whose raw text
# contains that title. The result is a list of one-entry Series (named after
# the section, with a single 'dist' value) that the next cell tabulates.
total_notes = len(df)
s_list = []
for section in input_sections.keys():
    # regex=False: the title is matched literally, so a title containing
    # characters such as "(" or "." cannot be misread as a regular expression.
    hits = int(df['text'].str.contains(section, regex=False).sum())
    # An empty frame has no meaningful coverage; report NaN instead of
    # raising ZeroDivisionError.
    coverage = hits / total_notes if total_notes else float('nan')
    s_list.append(pd.Series({'dist': coverage}, name=section))

In [8]:
# Tabulate the per-section coverage: one row per section, one 'dist' column.
pd.DataFrame(s_list)

Unnamed: 0,dist
Allergies,1.0
Chief Complaint,0.96798
Major Surgical or Invasive Procedure,0.997811
History of Present Illness,0.977468
Past Medical History,0.978106
Social History,0.971538
Family History,0.966794
Physical Exam,0.954753
Pertinent Results,0.979748
Brief Hospital Course,1.0


## Parse

In [9]:
# Run parse_sections on every row in parallel (pandarallel). Judging by the
# columns present in later cells it adds one column per section — TODO confirm
# against the definition of parse_sections.
df = df.parallel_apply(parse_sections, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1097), Label(value='0 / 1097'))), …

In [10]:
# Column names of the input sections: each section title with its spaces
# replaced by underscores.
sections_col = list(map(lambda title: title.replace(" ", "_"), input_sections.keys()))

In [11]:
# Each non-null cell of a section column holds an indexable value; keep only
# its element at index 1 and leave missing cells untouched.
# NOTE: this cell is not idempotent — running it a second time would index
# into the values that were already extracted.
for col in sections_col:
    mask = df[col].notna()
    df.loc[mask, col] = df.loc[mask, col].map(lambda parsed: parsed[1])

In [12]:
# Checkpoint the frame with the extracted section columns; the next section
# of the notebook reloads it from this file.
df.to_pickle(os.path.join(data_path, "discharge_extracted.pkl"))

# Radiology Report Selection

In [13]:
# Reload the checkpoint written by the section-extraction step, so this part
# of the notebook can start from the file instead of re-running the parsing.
# Pickle files execute code on load — only read files this notebook produced.
df = pd.read_pickle(os.path.join(data_path, "discharge_extracted.pkl"))

In [14]:
# Apply replace_pertinent_results_with_radiology to every row. Going by its
# name it swaps the Pertinent Results section for the selected radiology
# reports — TODO confirm against its definition.
df = df.apply(replace_pertinent_results_with_radiology, axis=1)

In [15]:
# Inspect the frame after the radiology step; the rendered output is truncated
# by pandas to the first and last rows and a subset of the columns.
df

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,radiology_text,Allergies,...,Family_History,Physical_Exam,Pertinent_Results,Brief_Hospital_Course,Medications_on_Admission,Discharge_Medications,Discharge_Disposition,Discharge_Diagnosis,Discharge_Condition,Discharge_Instructions
0,10001884-DS-35,10001884,24962904,DS,35,2130-12-08 00:00:00,2130-12-13 21:50:00,\nName: ___ Unit No: ___\n \nA...,[EXAMINATION:\nChest: Frontal and lateral vie...,"IV Dye, Iodine Containing Contrast Media / Oxy...",...,Mother with asthma and hypertension. Father wi...,ADMISSION PHYSICAL EXAM:\n====================...,ADMISSION LABS: \n=========================\n_...,Ms. ___ is a ___ female with history of \nCOPD...,The Preadmission Medication list is accurate a...,1. Acetaminophen 325 mg PO Q4H:PRN Pain \n2. a...,Extended Care,PRIMARY:\nCOPD Exacerbation\n\nSECONDARY:\nAfi...,Mental Status: Clear and coherent.,"Dear Ms. ___,\n\nYou were admitted to ___ afte..."
1,10003019-DS-22,10003019,22774359,DS,22,2175-10-17 00:00:00,2175-10-24 14:40:00,\nName: ___. Unit No: ___\n \...,"[EXAM: Chest, frontal and lateral views.\n\nC...",Ragweed / morphine / Percocet,...,"Mother: ___, cardiac disease. \nFather: diver...",ADMISSION EXAM\nVitals: 124/67 on neosynephrin...,ADMISSION LABS\n___ 10:40AM BLOOD WBC-0.2* RBC...,___ male with h/o Hodgkin's lymphoma C1D17 ABV...,The Preadmission Medication list is accurate a...,1. Acyclovir 400 mg PO Q8H \n2. Fluconazole 40...,Home With Service,"Primary Diagnosis\nNeutropenic Fever, no sourc...",Mental Status: Clear and coherent.,"Dear Mr. ___,\n\nIt has been our pleasure to b..."
2,10003299-DS-7,10003299,29323205,DS,7,2181-10-23 00:00:00,2181-10-24 07:08:00,\nName: ___ Unit No: ___\n...,[EXAMINATION: CT HEAD W/O CONTRAST Q111 CT HE...,Iodine-Iodine Containing,...,Mother had stroke in her ___ or ___. Her pate...,Admission Exam:\nVitals: T: 97.4 P: 65 R: 16 ...,___ 01:10PM GLUCOSE-125* UREA N-9 CREAT-0.9 ...,___ RH female with a PMHx of paramedian pontin...,The Preadmission Medication list is accurate a...,1. Aspirin 81 mg PO DAILY \n2. Atorvastatin...,Home With Service,Ischemic stroke,Mental Status: Clear and coherent.,Dear ___ were hospitalized due to symptoms of ...
3,10003502-DS-7,10003502,20459702,DS,7,2166-02-19 00:00:00,2166-02-20 09:32:00,\nName: ___ Unit No: ___\n...,[HISTORY: Nausea and chllls.\n\nTECHNIQUE: F...,nifedipine / Amitriptyline / Prilosec OTC / Te...,...,Mother deceased at ___ yo from breast cancer. ...,On admission: \nVS 97.4 140/P 62 18 96\nGEN Al...,___ 10:15AM BLOOD WBC-6.4 RBC-3.64* Hgb-11.1* ...,Hospitalization Summary: \nMs. ___ is an ___ y...,The Preadmission Medication list is accurate a...,1. Acetaminophen 500 mg PO Q6H:PRN pain \n2. A...,Home With Service,Primary: \nAcute diastolic CHF exacerbation\nN...,Mental Status: Clear and coherent.,It was a pleasure caring for you at ___ \n___....
4,10004322-DS-21,10004322,28755331,DS,21,2131-01-26 00:00:00,2131-01-26 18:32:00,\nName: ___ Unit No: ___\n...,[CHEST RADIOGRAPH PERFORMED ON ___\n\nCOMPARIS...,No Known Allergies / Adverse Drug Reactions,...,Unknown to patient.,ADMISSION PHYSICAL EXAM: \nVS: 98.1 117/62 97 ...,ADMISSION LABS:\n___ 03:40PM BLOOD WBC-17.7* R...,___ with h/o psychosis admitted because of mul...,The Preadmission Medication list is accurate a...,1. Acetaminophen 650 mg PO Q6H:PRN pain \n2. A...,Home With Service,Primary: mechanical fall,Mental Status: Confused - sometimes.,"Dear Mr. ___,\n\nIt was our pleasure to care f..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10957,19995366-DS-16,19995366,23598426,DS,16,2148-05-18 00:00:00,2148-05-18 14:33:00,\nName: ___ Unit No: ___...,[INDICATION: Isolated right sixth nerve palsy...,No Known Allergies / Adverse Drug Reactions,...,Family Hx: mother had a brain aneurysm at age ...,Physical Exam: \n\nVitals: T: 97.8 P: 70 R: 16...,Pertinent Laboratory Data\n\nAlbumin-4.3 Calci...,Right Abducens Nerve Palsy\nThe patient was ad...,1. Simvastatin 20 mg PO DAILY,1. Simvastatin 20 mg PO DAILY,Home,Abducens nerve palsy.,Mental Status: Clear and coherent.,You came to the hospital with double vision (d...
10958,19997367-DS-24,19997367,29933340,DS,24,2128-01-21 00:00:00,2128-01-22 21:41:00,\nName: ___ Unit No: ___\...,[EXAMINATION: CHEST (PA AND LAT)\n\nINDICATIO...,Cipro Cystitis / Dilaudid / Mexiletine / Bactr...,...,"Father with MS. ___, denies sister and mother ...",ADMISSION PHYSICAL EXAMINATION\n==============...,ADMISSION LABS\n====================\n___ 05:1...,Ms. ___ is a ___ y/o woman with a complicated ...,The Preadmission Medication list is accurate a...,1. FoLIC Acid 1 mg PO DAILY \n2. Rifaximin 550...,Home With Service,PRIMARY\nHepatic Encephalopathy\nUTI\nPulmonar...,Mental Status: Clear and coherent.,"Dear Ms. ___,\n\nIt was a pleasure being a par..."
10959,19997367-DS-25,19997367,22314636,DS,25,2128-01-31 00:00:00,2128-02-01 17:02:00,\nName: ___ Unit No: ___\...,[EXAMINATION: CHEST (AP AND LATERAL)\n\nINDI...,Cipro Cystitis / Dilaudid / Mexiletine / Bactr...,...,"Father with MS. ___, denies sister and mother ...","ADMISSION PHYSICAL EXAM:\nVitals - 97.8, 106/3...",ADMISSION LABS:\n================\n___ 11:16PM...,"___ PMH of Cirrhosis ___ hemochromatosis, c/b ...",The Preadmission Medication list is accurate a...,1. Acyclovir 400 mg PO Q12H \n2. Aspirin 81 mg...,Home With Service,Primary diagnoses:\n- Acute on chronic diastol...,Mental Status: Clear and coherent.,"Dear ___,\n\n___ was a pleasure taking care of..."
10960,19997843-DS-13,19997843,20277361,DS,13,2120-11-28 00:00:00,2120-11-28 16:16:00,\nName: ___ Unit No: ___...,[EXAMINATION: CHEST (PORTABLE AP)\n\nINDICATI...,No Allergies/ADRs on File,...,No family history of liver disease,ADMISSION PHYSICAL EXAM: \n===================...,===============\nAdmission labs\n=============...,SUMMARY: \n___ y/o ___ immigrant with h/o alco...,The Preadmission Medication list is accurate a...,,Home,PRIMARY DIAGNOSES\n# Complex Alcohol Withdrawa...,Mental Status: Confused - sometimes.,"___,\n\nFue un placer atenderlo en ___ Médico ..."


# Target Section Summarization

## Read generated Brief Hospital Course

In [16]:
# Load the previously generated Brief Hospital Course texts for this split.
# The path is built from data_path, like the other reads/writes of this
# notebook, so that changing data_path redirects every file consistently.
brief_hospital_course_df = pd.read_csv(os.path.join(data_path, "brief_hospital_course.csv"))

In [17]:
# Attach the generated Brief Hospital Course to each note by hadm_id.
# merge() defaults to an inner join, so notes without a matching hadm_id in
# brief_hospital_course_df are dropped here.
df = df.merge(brief_hospital_course_df, on=['hadm_id'])

## Pre-processing

Remove Target from the Input Text

In [18]:
# Apply remove_output_from_input row by row, then expose the 'new_text' column
# of its result under the name 'processed_text'.
df = (
    df
    .apply(remove_output_from_input, axis=1)
    .rename(columns={'new_text': 'processed_text'})
)

Calculate Number of Words

In [19]:
# Apply calculate_word_count to every row; which columns it adds is not
# visible from this notebook — TODO confirm against its definition.
df = df.apply(calculate_word_count, axis=1)

## Discharge Instructions Summarization

In [20]:
# Length of each processed text measured as the number of pieces obtained by
# splitting on single spaces, then order the notes from longest to shortest.
df['processed_text_word_count'] = df['processed_text'].map(
    lambda text: len(text.split(" "))
)
df = df.sort_values(by='processed_text_word_count', ascending=False)

In [21]:
# Word-count cut-off: notes strictly below it get category 1, all others 0.
thres = 1000
df['category'] = (df['processed_text_word_count'] < thres).astype('int64')

In [22]:
# Notes from `thres` up to 1300 words (both ends inclusive) form category 2.
# The lower bound reuses `thres` instead of repeating the literal 1000, so it
# cannot drift apart from the category-1 cut-off if that value is tuned.
mask = df['processed_text_word_count'].between(thres, 1300)
df.loc[mask, 'category'] = 2

In [23]:
# Cache directory handed to target_section_summarization. It is derived from
# data_path, like the other files of this split, so that changing data_path
# moves the cache together with the data.
root_path = os.path.join(data_path, 'discharge_instructions_cache/')
# Size of the multiprocessing pool used for the summarization call.
num_workers = 1

In [24]:
# One argument tuple per category (0, 1, 2): cache root, target section name,
# category id, the rows of that category with a fresh 0..n-1 index, and the
# constant 100 (its meaning is defined by target_section_summarization).
categories = (0, 1, 2)
inputs = [
    (
        root_path,
        'discharge_instructions',
        domain,
        df.loc[df['category'] == domain].reset_index(drop=True),
        100,
    )
    for domain in categories
]

start_time = time.time()
# starmap unpacks each tuple into the positional arguments of the worker and
# returns the results in the same order as `inputs`.
with Pool(processes=num_workers) as processor:
    data = processor.starmap(target_section_summarization, inputs)

0 :  Loaded cached file. Done
1 :  Loaded cached file. Done
2 :  Loaded cached file. Done


In [25]:
# Stack the per-category results and keep the first row seen for each hadm_id.
processed_df = pd.concat(data)
processed_df = processed_df.drop_duplicates(subset=['hadm_id'])
processed_df.shape

(10962, 2)

In [26]:
# Attach the generated text to the notes by hadm_id (inner join by default:
# notes without a generated row are dropped).
df = df.merge(processed_df, on=['hadm_id'])

# Post-processing

In [27]:
# Three clean-up passes over the generated discharge instructions:
#   1. remove_repitition (project helper),
#   2. re-join lines that were broken right after a whitespace character,
#   3. remove_sent_repitition (project helper).
df['discharge_instructions'] = df['discharge_instructions'].apply(remove_repitition)
# Pass 2 removes a newline that follows "<letter, digit or , . _><whitespace>"
# and precedes a letter or digit. Non-string values pass through unchanged
# instead of making re.sub raise TypeError: the following cell already treats
# this column as possibly containing nulls.
df['discharge_instructions'] = df['discharge_instructions'].apply(
    lambda text: re.sub(r'([A-Za-z0-9,._][ \s])\n([A-Za-z0-9])', r'\1\2', text)
    if isinstance(text, str) else text
)
df['discharge_instructions'] = df['discharge_instructions'].apply(remove_sent_repitition)

In [28]:
# Run post_process_discharge_instructions (in parallel, row-wise) only on the
# rows that have generated instructions, and write the results back in place.
mask = df['discharge_instructions'].notna()
df.loc[mask, 'discharge_instructions'] = df.loc[mask].parallel_apply(
    post_process_discharge_instructions, axis=1
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1097), Label(value='0 / 1097'))), …

In [29]:
# Keep a single row per hadm_id (the first one in the current row order) so
# the exported file has one entry per admission.
df = df.drop_duplicates(subset=['hadm_id'])

In [30]:
# Spot-check one generated text; print() is used so that line breaks render
# as line breaks instead of escaped "\n".
print(df['discharge_instructions'].iloc[0])

Dear Ms. ___,
It was a pleasure taking care of you at ___. You were admitted for pneumonia. You were treated with antibiotics and prednisone. You are now ready to be discharged. Please follow the instructions below to continue your recovery:

Please call your doctor or nurse practitioner or return to the Emergency Department for any of the following:
*You experience new chest pain, pressure, squeezing or tightness.
*New or worsening cough, shortness of breath, or wheeze.
*If you are vomiting and cannot keep down fluids or your medications.
*You are getting dehydrated due to continued vomiting, diarrhea, or other reasons. Signs of dehydration include dry mouth, rapid heartbeat, or feeling dizzy or faint when standing.
*You see blood or dark/black material when you vomit or have a bowel movement.
*You experience burning when you urinate, have blood in your urine, or experience a discharge.
*Your pain increases or becomes constant, or changes location. Or you have new pain that is severe 

In [31]:
# Export the final (hadm_id, discharge_instructions) pairs without the index.
# The output path is built from data_path, like the other reads/writes of this
# notebook, so that changing data_path redirects the export as well.
df[['hadm_id', 'discharge_instructions']].to_csv(
    os.path.join(data_path, "discharge_instructions.csv"), index=False
)