In [1]:
import hydra
import pandas as pd
import wandb
import itertools
from datasets import load_dataset
from dotenv import load_dotenv
from langchain.output_parsers import OutputFixingParser
from langchain.schema import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from omegaconf import DictConfig, OmegaConf
from pandas import json_normalize
from tqdm import tqdm

In [2]:
# from huggingface_hub import notebook_login
# notebook_login()

**ISSUES**: Titles and sections are split incorrectly -> caused by the REGEX?
   - Titles and the contents of different sections get mixed together
   - Wrong `section_title` value
   - `section_content` overlaps with information from other sections (e.g., check the date columns)
   - Duplicated `section_title` values within a single document, sometimes even at consecutive positions in the document

**NOTES**
1. Tested sections: `allergies|history of present illness|past medical history|discharge medications|social history|medications on admission`

In [3]:
# Load the "train" split of the `bio-datasets/mimoracle` dataset and convert it
# to a pandas DataFrame (presumably fetched from the Hugging Face Hub — confirm).
df = load_dataset("bio-datasets/mimoracle", split="train").to_pandas()
# Preview the first rows; the bare trailing expression is rendered as the cell output.
df.head()

Unnamed: 0,subject_id,document_id,chartdate,text,section_title,section_content,section_start,section_end
0,68,9138,2174-01-03 00:00:00.000000,Admission Date: [**2173-12-15**] ...,Admission Date,[**2173-12-15**] Discharge Date: ...,0,80
1,68,9138,2174-01-03 00:00:00.000000,Admission Date: [**2173-12-15**] ...,Date of Birth,[**2132-2-29**] Sex: F,80,134
2,68,9138,2174-01-03 00:00:00.000000,Admission Date: [**2173-12-15**] ...,Allergies,Nevirapine / Abacavir / Ampicillin / Tylenol /...,153,260
3,68,9138,2174-01-03 00:00:00.000000,Admission Date: [**2173-12-15**] ...,Chief Complaint,"Productive cough, fever",260,301
4,68,9138,2174-01-03 00:00:00.000000,Admission Date: [**2173-12-15**] ...,None\n\n\nHistory of Present Illness,Ms. [**Known lastname 31473**] is a 41yo F wit...,340,1324


In [4]:
def _preprocess(text: str) -> str:
    text = text.split('\n')[-1].lower()
    return text

def _resample(df: pd.DataFrame, n_sample:int, n_section:int) -> pd.DataFrame:
    patterns = 'allergies|history of present illness|past medical history|discharge medications|social history|medications on admission'
    df['section_title'] = [_preprocess(x) for x in df['section_title']]
    df = df[df.section_title.str.contains(patterns)]
    df = df.groupby('section_title').filter(lambda x: len(x) > n_sample)
    df = df.groupby('document_id').filter(lambda x: len(x) == n_section)
    return df

In [5]:
# n_sample=2, n_section=6: keep tested section titles that occur more than
# 2 times, then keep only documents left with exactly 6 such rows.
sample_df = _resample(df, 2, 6)
# Preview the first rows of the filtered sample as the cell output.
sample_df.head()

Unnamed: 0,subject_id,document_id,chartdate,text,section_title,section_content,section_start,section_end
27,68,9139,2174-01-18 00:00:00.000000,Admission Date: [**2174-1-4**] D...,allergies,Nevirapine / Abacavir / Ampicillin / Tylenol /...,152,261
30,68,9139,2174-01-18 00:00:00.000000,Admission Date: [**2174-1-4**] D...,history of present illness,41F with advanced HIV/AIDS (last CD4 5 in [**8...,463,2124
31,68,9139,2174-01-18 00:00:00.000000,Admission Date: [**2174-1-4**] D...,past medical history,"HIV/AIDS - h/o PCP x 2, MAC, cervical dysplasi...",2124,2373
32,68,9139,2174-01-18 00:00:00.000000,Admission Date: [**2174-1-4**] D...,social history,Divorced. Lives in apartment with 13 yo daught...,2373,2605
44,68,9139,2174-01-18 00:00:00.000000,Admission Date: [**2174-1-4**] D...,medications on admission,(unclear which meds pt was taking for the 2 da...,6764,7270


In [9]:
# Write the filtered sample to a CSV file at a path relative to the current
# working directory; index=False leaves the DataFrame index out of the file.
sample_df.to_csv('mimoracle_sample.csv', index=False)