# In [None]:
import os
import pandas as pd
import numpy as np
from tableone import TableOne
import re

# ------------------------------------------------------------
# 1. CONFIGURATION
# ------------------------------------------------------------
# Absolute, machine-specific path to the labels CSV read in section 2.
LABELS_PATH = "/Users/luisnakayama/Downloads/labels_mbrset.csv"
IMG_COL = "file"  # column matched against the macula file-name pattern
PATIENT_COL = "patient"  # grouping key used to build the patient-level table
DR_COL = "final_icdr"  # ICDR grade column; coerced to numeric in section 3
EDEMA_COL = "final_edema"  # edema column; "yes"/"no" text is mapped to 1/0
SEX_COL = "sex"  # sex code column; mapped as 0 -> 'Female', 1 -> 'Male'

# ------------------------------------------------------------
# 2. LOAD AND FILTER MACULA (.1 AND .3)
# ------------------------------------------------------------
df = pd.read_csv(LABELS_PATH)

# Macula images (per the section header) are the ones whose name ends in
# ".1" or ".3", optionally followed by ".jpg"; matching is case-insensitive.
macula_regex = re.compile(r'\.[13](\.jpg)?$', re.IGNORECASE)


def _is_macula_image(file_name):
    """Return True when *file_name* matches the macula suffix pattern."""
    return bool(macula_regex.search(file_name))


# Cast to str first so non-string entries (e.g. missing values) can be
# searched like any other name.
image_names = df[IMG_COL].astype(str)
df_macula = df[image_names.apply(_is_macula_image)].copy()

# ------------------------------------------------------------
# 3. STANDARDIZING LABELS AND DEFINING TASKS
# ------------------------------------------------------------
# A. Encode edema as a 0/1 flag ("yes" -> 1, "no" -> 0)
if EDEMA_COL in df_macula.columns:
    # Normalise case and surrounding whitespace, and keep the cleaned text
    # in the original column.
    edema_text = df_macula[EDEMA_COL].astype(str).str.lower().str.strip()
    df_macula[EDEMA_COL] = edema_text
    # Any value other than "yes"/"no" (including missing entries, which
    # astype(str) turned into text) has no mapping and ends up as 0.
    edema_flag = edema_text.map({'yes': 1, 'no': 0})
    df_macula['edema_bin'] = edema_flag.fillna(0).astype(int)

# B. Standardize ICDR: grades that cannot be parsed as numbers become NaN,
#    and rows without a usable grade are dropped.
icdr_numeric = pd.to_numeric(df_macula[DR_COL], errors='coerce')
df_macula[DR_COL] = icdr_numeric
df_macula = df_macula.dropna(subset=[DR_COL])

# C. Task: Any DR (grade 0 -> 0, grade >= 1 -> 1)
df_macula['Task_Any_DR'] = df_macula[DR_COL].ge(1).astype(int)

# D. Task: Referable DR (ICDR >= 2 OR Edema == 1)
referable_mask = df_macula[DR_COL] >= 2
# 'edema_bin' is only created when the edema column exists in the labels
# file. Without it, fall back to the ICDR criterion alone instead of
# failing with a KeyError.
if 'edema_bin' in df_macula.columns:
    referable_mask = referable_mask | (df_macula['edema_bin'] == 1)
df_macula['Task_Referable'] = referable_mask.astype(int)

# E. Task: 3-Class Severity (0 vs 1-3 vs 4)
# 0: Healthy, 1: Mild/Mod/Severe NPDR, 2: PDR (Proliferative)
def classify_3_classes(row, *, dr_col=None):
    """Map one record's ICDR grade to a 3-class severity label.

    Args:
        row: Mapping-like record (for example a DataFrame row) that holds
            the grade.
        dr_col: Key of the grade inside ``row``. When omitted, the
            module-level ``DR_COL`` is used, which keeps existing
            ``DataFrame.apply(classify_3_classes, axis=1)`` calls working.

    Returns:
        0 when the grade equals 0, 1 when 1 <= grade <= 3, otherwise 2.
        The last case is meant for ICDR 4 but is a catch-all: NaN and
        out-of-range grades also return 2, so drop or validate those
        before calling.
    """
    grade = row[DR_COL if dr_col is None else dr_col]
    if grade == 0:
        return 0
    if 1 <= grade <= 3:
        return 1
    # Everything else, nominally ICDR 4.
    return 2

# Row-wise classification of every image into the 3-class severity scheme.
severity_classes = df_macula.apply(classify_3_classes, axis=1)
df_macula['Task_3_Classes'] = severity_classes

# F. Demographics Mapping
# Codes without an entry in a mapping come out as NaN in the label column.
sex_names = {0: 'Female', 1: 'Male'}
df_macula['Sex_Label'] = df_macula[SEX_COL].map(sex_names)
if 'insurance' in df_macula.columns:
    insurance_names = {0: 'Public', 1: 'Private'}
    df_macula['insurance_label'] = df_macula['insurance'].map(insurance_names)

# ------------------------------------------------------------
# 4. PATIENT-LEVEL DESCRIPTION (TABLEONE)
# ------------------------------------------------------------
# Collapse the image-level rows to one row per patient.
# NOTE(review): GroupBy.first() is documented to take, per column, the first
# non-null entry in each group, so a patient row may combine values from
# different images. Confirm this is the intended patient-level summary.
df_patient = df_macula.groupby(PATIENT_COL).first()

# Variables to report. DR_COL is the only one not declared categorical.
columns = [
    'Sex_Label',
    'educational_level',
    'insurance_label',
    'edema_bin',
    DR_COL,
    'Task_Any_DR',
    'Task_Referable',
    'Task_3_Classes',
]

categorical = [
    'Sex_Label',
    'educational_level',
    'insurance_label',
    'edema_bin',
    'Task_Any_DR',
    'Task_Referable',
    'Task_3_Classes',
]

# Optional columns may be absent, so only pass those that actually exist.
table_columns = [name for name in columns if name in df_patient.columns]
table_categorical = [name for name in categorical if name in df_patient.columns]

mytable = TableOne(
    df_patient,
    columns=table_columns,
    categorical=table_categorical,
    groupby='Task_3_Classes',
    pval=True,
    label_suffix=True,
)

print("\n>>> MBRSET MACULA COHORT (Grouped by 3-Class Severity):")
print(mytable.tabulate(tablefmt="fancy_grid"))
