# Create RETINA dataset files

In [1]:
# Third-party / stdlib imports used throughout the notebook.
from sklearn.model_selection import train_test_split
import numpy as np
from pathlib import Path
import pandas as pd

# Project root: added to sys.path below and reused later as the parent of the
# "experiments" output directory.
# NOTE(review): absolute, user-specific path -- the notebook will only run as-is
# on a machine where this location exists; consider making it configurable.
path_to_root = "/vol/biomedic3/bglocker/mscproj/zm1224/msc_individual_project/"

import sys

# Must run before the `default_paths` import below so that module can be found.
sys.path.append(path_to_root)

# Dataset root directories (used with the `/` path operator in later cells).
from default_paths import MESSIDOR_ROOT, APTOS_ROOT, DIABETIC_ROOT

## Load MESSIDOR dataset

In [2]:
# Load the MESSIDOR label sheet and harmonise it with the other sites:
#   - `diagnosis` mirrors the adjudicated DR grade,
#   - `site` is the dataset identifier (1 = MESSIDOR),
#   - `img_path` points at the image file under MESSIDOR_ROOT / "IMAGES".
df_m = pd.read_csv(MESSIDOR_ROOT / "messidor_data.csv")
df_m["diagnosis"] = df_m["adjudicated_dr_grade"]
df_m["site"] = 1
df_m["img_path"] = df_m["image_id"].apply(
    lambda image_name: MESSIDOR_ROOT / "IMAGES" / image_name
)

# Random split: 40% train; the remaining 60% is divided 20/80 into val/test.
# The seed is fixed so that Restart & Run All reproduces the same assignment;
# without it the exported split CSVs would change on every run.
train_id, val_test_id = train_test_split(
    np.arange(len(df_m)), train_size=0.40, random_state=42
)
val_id, test_id = train_test_split(val_test_id, train_size=0.20, random_state=42)

# The ids are row positions; they are used as `.loc` labels, which relies on the
# default 0..n-1 index produced by `read_csv`.
df_m.loc[train_id, "split"] = "train"
df_m.loc[val_id, "split"] = "val"
df_m.loc[test_id, "split"] = "test"
df_m

Unnamed: 0,image_id,adjudicated_dr_grade,adjudicated_dme,adjudicated_gradable,diagnosis,site,img_path,split
0,20051020_43808_0100_PP.png,0.0,0.0,1,0.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/2005...,test
1,20051020_43832_0100_PP.png,1.0,0.0,1,1.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/2005...,test
2,20051020_43882_0100_PP.png,1.0,0.0,1,1.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/2005...,test
3,20051020_43906_0100_PP.png,2.0,1.0,1,2.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/2005...,train
4,20051020_44261_0100_PP.png,0.0,0.0,1,0.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/2005...,test
...,...,...,...,...,...,...,...,...
1743,IM004806.jpg,0.0,0.0,1,0.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/IM00...,train
1744,IM004811.jpg,1.0,0.0,1,1.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/IM00...,train
1745,IM004812.jpg,2.0,0.0,1,2.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/IM00...,train
1746,IM004831.jpg,0.0,0.0,1,0.0,1,/vol/biomedic3/mb121/data/messidor/IMAGES/IM00...,val


## Load APTOS dataset

In [3]:
# Load the APTOS label sheet and harmonise it with the other sites:
#   - `site` is the dataset identifier (2 = APTOS),
#   - `img_path` points at "<id_code>.png" under APTOS_ROOT / "train_images".
# The CSV's own `diagnosis` column is used as-is.
df_a = pd.read_csv(APTOS_ROOT / "train.csv")
df_a["site"] = 2
df_a["img_path"] = df_a["id_code"].apply(
    lambda image_name: APTOS_ROOT / "train_images" / f"{image_name}.png"
)

# Random split: 40% train; the remaining 60% is divided 20/80 into val/test.
# The seed is fixed so that Restart & Run All reproduces the same assignment;
# without it the exported split CSVs would change on every run.
train_id, val_test_id = train_test_split(
    np.arange(len(df_a)), train_size=0.40, random_state=42
)
val_id, test_id = train_test_split(val_test_id, train_size=0.20, random_state=42)

# The ids are row positions; they are used as `.loc` labels, which relies on the
# default 0..n-1 index produced by `read_csv`.
df_a.loc[train_id, "split"] = "train"
df_a.loc[val_id, "split"] = "val"
df_a.loc[test_id, "split"] = "test"
df_a

Unnamed: 0,id_code,diagnosis,site,img_path,split
0,000c1434d8d7,2,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,test
1,001639a390f0,4,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,val
2,0024cdab0c1e,1,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,train
3,002c21358ce6,0,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,test
4,005b95c28852,0,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,test
...,...,...,...,...,...
3657,ffa47f6a7bf4,2,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,test
3658,ffc04fed30e6,0,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,test
3659,ffcf7b45f213,2,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,test
3660,ffd97f8cd5aa,0,2,/vol/biomedic3/mb121/data/aptos2019/train_imag...,train


## Load EyePACS dataset

In [4]:
# Load the EyePACS label sheets and attach the image path for every row.
train_df = pd.read_csv(DIABETIC_ROOT / "trainLabels.csv")
train_df["img_path"] = train_df["image"].apply(
    lambda x: DIABETIC_ROOT / "train" / f"{x}.jpeg"
)
# NOTE(review): `val_test_df` is loaded but not included in `all_eyepacs` below;
# only the official training labels are re-split. Confirm this is intentional.
val_test_df = pd.read_csv(DIABETIC_ROOT / "retinopathy_solution.csv")
val_test_df["img_path"] = val_test_df["image"].apply(
    lambda x: DIABETIC_ROOT / "test" / f"{x}.jpeg"
)

# Harmonise with the other sites: `site` is the dataset identifier
# (3 = EyePACS) and `diagnosis` mirrors the `level` column.
all_eyepacs = pd.concat([train_df], ignore_index=True)
all_eyepacs["site"] = 3
all_eyepacs["diagnosis"] = all_eyepacs["level"]

# Random split: 40% train; the remaining 60% is divided 20/80 into val/test.
# The seed is fixed so that Restart & Run All reproduces the same assignment;
# without it the exported split CSVs would change on every run.
train_id, val_test_id = train_test_split(
    np.arange(len(all_eyepacs)), train_size=0.40, random_state=42
)
val_id, test_id = train_test_split(val_test_id, train_size=0.20, random_state=42)

# The ids are row positions; `ignore_index=True` above guarantees a 0..n-1 index,
# so they are also valid `.loc` labels.
all_eyepacs.loc[train_id, "split"] = "train"
all_eyepacs.loc[val_id, "split"] = "val"
all_eyepacs.loc[test_id, "split"] = "test"
all_eyepacs

PermissionError: [Errno 13] Permission denied: '/vol/biodata/data/diabetic_retino/trainLabels.csv'

## Create combined RETINA dataset

In [None]:
# Stack the three per-site tables (APTOS, MESSIDOR, EyePACS) under a fresh
# 0..n-1 index and keep only the columns needed downstream.
shared_columns = ["diagnosis", "img_path", "site", "split"]
combined_df = (
    pd.concat([df_a, df_m, all_eyepacs], ignore_index=True)
    .loc[:, shared_columns]
    # Boolean flag: True when the grade is strictly below 2.
    .assign(binary_diagnosis=lambda frame: frame["diagnosis"] < 2)
    # Rows without a grade are discarded (index labels of kept rows are unchanged).
    .dropna(subset="diagnosis")
)
combined_df

## Write the train, val and test splits to CSV files

In [None]:
# Partition the combined table by its pre-assigned "split" label.
split_label = combined_df["split"]
train_df = combined_df.loc[split_label == "train"]
val_df = combined_df.loc[split_label == "val"]
test_df = combined_df.loc[split_label == "test"]

# Write each partition (without the index column) under <root>/experiments.
experiments_dir = Path(path_to_root) / "experiments"
for csv_name, frame in (
    ("retina_train.csv", train_df),
    ("retina_val.csv", val_df),
    ("retina_test.csv", test_df),
):
    frame.to_csv(experiments_dir / csv_name, index=False)

## Print stats

In [None]:
# Class balance of the training split: relative frequencies first, then raw counts.
tuple(
    train_df["binary_diagnosis"].value_counts(normalize=as_fraction)
    for as_fraction in (True, False)
)

In [None]:
# Class balance of the test split: relative frequencies first, then raw counts.
tuple(
    test_df["binary_diagnosis"].value_counts(normalize=as_fraction)
    for as_fraction in (True, False)
)

In [None]:
# Site composition of each split as fractions (train, val, test), followed by
# the raw per-site counts of the test split.
(
    *(
        split_frame["site"].value_counts(normalize=True)
        for split_frame in (train_df, val_df, test_df)
    ),
    test_df["site"].value_counts(normalize=False),
)