# This notebook preprocesses MIMIC-III tables

In [None]:
# Generic imports
import pandas as pd
from pathlib import Path

In [None]:
# Custom imports
from src.processing_tools import (
    read_in_files,
    preprocess_tables
)   

In [None]:
# Lift pandas' display limits (column width, column count, total width) so
# tables are not truncated when inspected in the notebook.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

This notebook processes datasets from MIMIC-III for downstream automated ARDS/control adjudication.  
Specifically, it performs the steps highlighted in bold face:  
- **File I/O, which depends on a specs file.**  
- **Standard preprocessing, in which column names are standardized.**  
- **Hospital-specific processing, which is temporarily custom-made.**  
- Anonymization, which consists of two substeps:  
    + Anonymizing patient/encounter IDs and datetime columns for all tables.
    + Anonymizing text-based tables (chest X-ray reports, attending notes, ECHO reports).  
- Segmentation of text-based tables.

## Reading in raw tables

In [None]:
# Load the raw tables and the specs object in one call. The names are used
# below as: pf = PF ratios, bi = chest X-ray reports, notes = attending notes,
# echo = echocardiography reports, bnp = BNP values. `peep` may be None (it is
# checked before being written out), and `specs` supplies the 'date_format'
# used when the tables are saved as CSV.
# NOTE(review): `dictionary` is presumably the patient/encounter ID mapping
# table mentioned in the BNP section -- confirm against read_in_files.
dictionary, pf, peep, bi, notes, echo, bnp, specs = read_in_files()

## General preprocessing of tables

In [None]:
# Run the shared preprocessing step over every table and rebind each name to
# its processed counterpart (per the outline above, this is where column names
# are standardized). Argument order must match the order of the returned tables.
dictionary, pf, peep, bi, notes, echo, bnp = preprocess_tables(
    dictionary, pf, peep, bi, notes, echo, bnp
)

## Specific processing and counts of tables for this cohort

#### PF_ratio (hypoxemia assessment)

In [None]:
# Report how many distinct patients, encounters and ICU stays appear in the
# PF ratio table, followed by its total row count.
pf_patient_count = pf.patient_id.nunique()
print(f"Patients with PF ratios: {pf_patient_count}")
pf_encounter_count = pf.encounter_id.nunique()
print(f"Encounters with PF ratios: {pf_encounter_count}")
pf_icu_stay_count = pf.icu_id.nunique()
print(f"ICU stays with PF ratios: {pf_icu_stay_count}")
# Plain row count; uniqueness of the rows is not verified here.
pf_row_count = len(pf)
print(f"Uniquely-identified PF ratio entries: {pf_row_count}")

#### Chest X-ray reports (bilateral infiltrates assessment)

No cohort-specific processing is needed for this table; only its counts are reported below.

In [None]:
# Report the number of distinct encounters in the chest X-ray report table,
# followed by its total row count (row uniqueness is not verified here).
cxr_encounter_count = bi.encounter_id.nunique()
print(f"Encounters with CXR reports: {cxr_encounter_count}")
cxr_row_count = len(bi)
print(f"Uniquely-identified CXR reports in table: {cxr_row_count}")

#### Attending physician notes (risk factors and cardiac failure rule out)

In [None]:
# Report distinct patients and encounters in the attending notes table,
# followed by its total row count (row uniqueness is not verified here).
notes_patient_count = notes.patient_id.nunique()
print(f"Patients with attending notes: {notes_patient_count}")
notes_encounter_count = notes.encounter_id.nunique()
print(f"Encounters with attending notes: {notes_encounter_count}")
notes_row_count = len(notes)
print(f"Number of uniquely-identified attending notes: {notes_row_count}")

#### Echocardiography reports (objective cardiac failure rule out)

In [None]:
# Report distinct patients and encounters in the echocardiography report
# table, followed by its total row count (row uniqueness is not verified here).
echo_patient_count = echo.patient_id.nunique()
print(f"Number of unique patients : {echo_patient_count}")
echo_encounter_count = echo.encounter_id.nunique()
print(f"Encounters with Echocardiography reports: {echo_encounter_count}")
echo_row_count = len(echo)
print(f"Uniquely-identified Echocardiography reports: {echo_row_count}")

#### Beta/Brain Natriuretic Peptide (objective cardiac failure rule out)

The BNP table is merged with the dictionary table so that each patient_id corresponds to an encounter_id. In this case, the merge does not multiply the number of rows.

In [None]:
# Report distinct patients and encounters in the BNP table, followed by its
# total row count (row uniqueness is not verified here).
bnp_patient_count = bnp.patient_id.nunique()
print(f"Patients with BNP: {bnp_patient_count}")
bnp_encounter_count = bnp.encounter_id.nunique()
print(f"Encounters with BNP: {bnp_encounter_count}")
bnp_row_count = len(bnp)
print(f"Uniquely-identified BNPs: {bnp_row_count}")

## Storing files

In [None]:
# Destination for the preprocessed tables, resolved relative to the notebook's
# working directory: ../Preprocessed_data/MIMIC_III/labeled_subset
basedir = Path("..")
preprocess_location = basedir / 'Preprocessed_data'
cohort = 'MIMIC_III'
path = preprocess_location / cohort / 'labeled_subset'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# destination exists before the cells below write into it. exist_ok=True keeps
# re-running this cell harmless.
path.mkdir(parents=True, exist_ok=True)

In [None]:
# Save the PF ratio table as CSV without the index column; datetimes are
# written using the format configured in specs['date_format'].
pf_csv_path = path / "pf_ratio.csv"
pf.to_csv(pf_csv_path, date_format=specs['date_format'], index=False)

In [None]:
# The PEEP table is optional: only write it out when one was loaded.
if peep is not None:
    peep_csv_path = path / "peep.csv"
    peep.to_csv(peep_csv_path, date_format=specs['date_format'], index=False)

In [None]:
# Save the chest X-ray report table as CSV without the index column;
# datetimes are written using the format configured in specs['date_format'].
cxr_csv_path = path / "cxr.csv"
bi.to_csv(cxr_csv_path, date_format=specs['date_format'], index=False)

In [None]:
# Save the attending notes table as CSV without the index column; datetimes
# are written using the format configured in specs['date_format'].
notes_csv_path = path / "attending_notes.csv"
notes.to_csv(notes_csv_path, date_format=specs['date_format'], index=False)

In [None]:
# Save the echocardiography report table as CSV without the index column;
# datetimes are written using the format configured in specs['date_format'].
echo_csv_path = path / "echo_reports.csv"
echo.to_csv(echo_csv_path, date_format=specs['date_format'], index=False)

In [None]:
# Save the BNP table as CSV without the index column; datetimes are written
# using the format configured in specs['date_format'].
bnp_csv_path = path / "bnp.csv"
bnp.to_csv(bnp_csv_path, date_format=specs['date_format'], index=False)