In [None]:
import hydra
import pandas as pd
import wandb
import itertools
from datasets import load_dataset
from dotenv import load_dotenv
from langchain.output_parsers import OutputFixingParser
from langchain.schema import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from omegaconf import DictConfig, OmegaConf
from pandas import json_normalize
from tqdm import tqdm

In [None]:
# Optional: uncomment to authenticate with the Hugging Face Hub before loading the dataset.
# from huggingface_hub import notebook_login
# notebook_login()

**ISSUES**: Error in splitting titles and sections -> REGEX?
   - Sections and titles get mixed up with one another
   - Wrong `section_title` value
   - `section_content` overlaps with information from other sections (e.g., check the date columns)
   - Duplicate `section_title` values within a single document, even for sections at consecutive positions in the document

**NOTES**
1. Tested sections: `allergies|history of present illness|past medical history|discharge medications|social history|medications on admission`

In [None]:
# Load the "train" split of `bio-datasets/mimoracle` with `datasets.load_dataset`
# and convert it to a pandas DataFrame; the last line previews the first rows.
df = load_dataset("bio-datasets/mimoracle", split="train").to_pandas()
df.head()

In [None]:
def _preprocess(text: str) -> str:
    text = text.split("\n")[-1].lower()
    return text


def _resample(df: pd.DataFrame, n_sample: int, n_section: int) -> pd.DataFrame:
    patterns = "allergies|history of present illness|past medical history|discharge medications|social history|medications on admission"
    df["section_title"] = [_preprocess(x) for x in df["section_title"]]
    df = df[df.section_title.str.contains(patterns)]
    df = df.groupby("section_title").filter(lambda x: len(x) > n_sample)
    df = df.groupby("document_id").filter(lambda x: len(x) == n_section)
    return df

In [None]:
# Keep titles seen in more than 2 rows, then documents with exactly 6 matching rows.
sample_df = _resample(df, n_sample=2, n_section=6)
sample_df.head()

In [None]:
# Write the filtered train rows to CSV, omitting the DataFrame index.
sample_df.to_csv("mimoracle_train.csv", index=False)

In [None]:
# Same pipeline for the "test" split. This rebinds `df` and `sample_df`,
# so the train frames are no longer available after this cell.
df = load_dataset("bio-datasets/mimoracle", split="test").to_pandas()
sample_df = _resample(df, n_sample=2, n_section=6)
sample_df.to_csv("mimoracle_test.csv", index=False)