# Create RETINA dataset files

In [88]:
from sklearn.model_selection import train_test_split
import numpy as np
from pathlib import Path
import pandas as pd

In [None]:
# Root directories of the raw MESSIDOR and APTOS 2019 downloads; the label
# CSVs and image folders are resolved relative to these below.
MESSIDOR_ROOT = Path("/data/messidor/")
APTOS_ROOT = Path("/data/aptos2019/")
# NOTE(review): this looks like a placeholder -- set it to the real project
# checkout before running. The generated split CSVs are written under
# PROJECT_ROOT / "data".
PROJECT_ROOT = Path("path-to-project-root")

## Load MESSIDOR dataset

In [None]:
# Load the MESSIDOR label table and add the columns shared by every site in
# this notebook: "diagnosis", "site", "img_path" and "split".
df_m = pd.read_csv(MESSIDOR_ROOT / "messidor_data.csv")
df_m["diagnosis"] = df_m["adjudicated_dr_grade"]
df_m["site"] = 1  # site id used for MESSIDOR in the combined table
df_m["img_path"] = df_m["image_id"].apply(
    lambda image_name: MESSIDOR_ROOT / "IMAGES" / image_name
)

# Fixed seed: without it every run of this cell shuffles differently, so the
# exported train/val/test CSVs would not be reproducible between runs.
SPLIT_SEED = 0

# 40% of the rows go to train; the remaining 60% is divided 20/80 into
# val/test (i.e. 12% val and 48% test of the whole table).
train_id, val_test_id = train_test_split(
    np.arange(len(df_m)), train_size=0.40, random_state=SPLIT_SEED
)
val_id, test_id = train_test_split(
    val_test_id, train_size=0.20, random_state=SPLIT_SEED
)

# The positional ids double as row labels because read_csv (without
# index_col) produces a default 0..n-1 index.
df_m.loc[train_id, "split"] = "train"
df_m.loc[val_id, "split"] = "val"
df_m.loc[test_id, "split"] = "test"
df_m

## Load APTOS dataset

In [None]:
# Load the APTOS 2019 label table and add the columns shared by every site in
# this notebook: "site", "img_path" and "split". The "diagnosis" column used
# later is taken from the CSV as-is (it is not created here).
df_a = pd.read_csv(APTOS_ROOT / "train.csv")
df_a["site"] = 2  # site id used for APTOS in the combined table
df_a["img_path"] = df_a["id_code"].apply(
    lambda image_name: APTOS_ROOT / "train_images" / f"{image_name}.png"
)

# Fixed seed: without it every run of this cell shuffles differently, so the
# exported train/val/test CSVs would not be reproducible between runs.
SPLIT_SEED = 0

# 40% of the rows go to train; the remaining 60% is divided 20/80 into
# val/test (i.e. 12% val and 48% test of the whole table).
train_id, val_test_id = train_test_split(
    np.arange(len(df_a)), train_size=0.40, random_state=SPLIT_SEED
)
val_id, test_id = train_test_split(
    val_test_id, train_size=0.20, random_state=SPLIT_SEED
)

# The positional ids double as row labels because read_csv (without
# index_col) produces a default 0..n-1 index.
df_a.loc[train_id, "split"] = "train"
df_a.loc[val_id, "split"] = "val"
df_a.loc[test_id, "split"] = "test"
df_a

## Load EyePACS dataset

In [None]:
# Load the EyePACS label tables and add the columns shared by every site in
# this notebook: "diagnosis", "site", "img_path" and "split".
DATA_DIR_DIABETIC = Path("/data/diabetic_retino")
train_df = pd.read_csv(DATA_DIR_DIABETIC / "trainLabels.csv")
train_df["img_path"] = train_df["image"].apply(
    lambda x: DATA_DIR_DIABETIC / "train" / f"{x}.jpeg"
)
val_test_df = pd.read_csv(DATA_DIR_DIABETIC / "retinopathy_solution.csv")
val_test_df["img_path"] = val_test_df["image"].apply(
    lambda x: DATA_DIR_DIABETIC / "test" / f"{x}.jpeg"
)

# NOTE(review): val_test_df is loaded above but is NOT part of all_eyepacs;
# only the rows from trainLabels.csv are used. Confirm whether leaving out
# retinopathy_solution.csv is intentional.
# The concat also gives all_eyepacs its own copy with a fresh 0..n-1 index,
# so the columns added below do not end up on train_df.
all_eyepacs = pd.concat([train_df], ignore_index=True)
all_eyepacs["site"] = 3  # site id used for EyePACS in the combined table
all_eyepacs["diagnosis"] = all_eyepacs["level"]

# Fixed seed: without it every run of this cell shuffles differently, so the
# exported train/val/test CSVs would not be reproducible between runs.
SPLIT_SEED = 0

# 40% of the rows go to train; the remaining 60% is divided 20/80 into
# val/test (i.e. 12% val and 48% test of the whole table).
train_id, val_test_id = train_test_split(
    np.arange(len(all_eyepacs)), train_size=0.40, random_state=SPLIT_SEED
)
val_id, test_id = train_test_split(
    val_test_id, train_size=0.20, random_state=SPLIT_SEED
)

# The positional ids double as row labels thanks to ignore_index=True above.
all_eyepacs.loc[train_id, "split"] = "train"
all_eyepacs.loc[val_id, "split"] = "val"
all_eyepacs.loc[test_id, "split"] = "test"
all_eyepacs

## Create combined RETINA dataset

In [None]:
# Stack the three per-site tables (APTOS, MESSIDOR, EyePACS) under a fresh
# 0..n-1 index and keep only the columns they have in common.
combined_df = pd.concat(
    [df_a, df_m, all_eyepacs], ignore_index=True
).loc[:, ["diagnosis", "img_path", "site", "split"]]

# Binary label: True when the grade is below 2, False otherwise. The
# comparison is vectorised; rows with a missing grade compare as False and
# are removed by the dropna on the next line.
combined_df["binary_diagnosis"] = combined_df["diagnosis"] < 2

# Discard rows without a grade. A list is passed for `subset` because older
# pandas releases do not accept a bare string there.
combined_df = combined_df.dropna(subset=["diagnosis"])
combined_df

## Write train, val, and test split CSV files

In [None]:
# Partition the combined table by the value of its "split" column.
test_df = combined_df.loc[combined_df.split == "test"]
val_df = combined_df.loc[combined_df.split == "val"]
train_df = combined_df.loc[combined_df.split == "train"]

# to_csv does not create missing folders, so make sure "data" exists first.
# `parents` is deliberately left at its default: if PROJECT_ROOT itself does
# not exist (e.g. it was never changed from its placeholder value) this
# raises instead of silently creating a stray directory tree.
output_dir = PROJECT_ROOT / "data"
output_dir.mkdir(exist_ok=True)

# One CSV per split: retina_train.csv, retina_val.csv, retina_test.csv.
for split_name, split_df in (
    ("train", train_df),
    ("val", val_df),
    ("test", test_df),
):
    split_df.to_csv(output_dir / f"retina_{split_name}.csv", index=False)

## Print stats

In [None]:
# Class balance of the binary label in the train split: relative frequencies
# first, then absolute counts.
tuple(
    train_df["binary_diagnosis"].value_counts(normalize=as_fraction)
    for as_fraction in (True, False)
)

In [None]:
# Class balance of the binary label in the test split: relative frequencies
# first, then absolute counts.
tuple(
    test_df["binary_diagnosis"].value_counts(normalize=as_fraction)
    for as_fraction in (True, False)
)

In [None]:
# Share of each site within the train, val and test splits (in that order),
# followed by the absolute per-site counts of the test split.
(
    *(
        split_df["site"].value_counts(normalize=True)
        for split_df in (train_df, val_df, test_df)
    ),
    test_df["site"].value_counts(normalize=False),
)