In [None]:
import sys
from functools import partial

import PIL
import numpy as np
import pandas as pd

import torch
import torchvision.transforms as T

import data_augmentation as aug
import robustdg_modified.dataset as dataset
import robustdg_modified.config as cfg

# Display the installed PyTorch version so the environment is recorded with the notebook.
torch.__version__

In [None]:
# Report the interpreter version and how many CUDA devices torch can see.
# (`sys` is imported in the notebook's import cell.)
print(sys.version)
print(f"Num GPUs Available: {torch.cuda.device_count()}")

# Use the first CUDA device when CUDA is usable; otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_device = torch.device(device)
torch_device

## Reproducibility

In [None]:
# Fixed seed so the random choices made later in this notebook can be reproduced.
SEED = 1

# Generator handed to the project's seeding helper.
# NOTE(review): presumably intended for DataLoader shuffling — confirm against cfg.reproducibility.
data_loader_generator = torch.Generator()
# NOTE(review): assumed to also seed NumPy's global RNG (np.random.choice is used below) — confirm.
cfg.reproducibility.seed_everything(SEED, data_loader_generator)

## Dataset

### Unbalanced Dataset

In [None]:
# Load the training label table and the training domain table from the configured CSV paths.
labels_csv = pd.read_csv(cfg.paths.LABELS_CSV["train"])
domain_csv = pd.read_csv(cfg.paths.DOMAIN_TRAIN_CSV["train"])

In [None]:
# Extract the image names plus the label and domain encodings from the raw tables.
# NOTE(review): going by the helper names, labels/domains are one-hot encoded frames — confirm in dataset.read.
img_names = dataset.read.get_image_names(labels_csv)
img_labels = dataset.read.get_one_hot_labels(labels_csv)
img_domain = dataset.read.get_one_hot_domain(domain_csv)

### Dataset information

In [None]:
# Names of the one-hot encoded label and domain columns, as returned by the metadata helper.
# CLASSES is reused at the end of the notebook to order the columns of the saved label CSV.
CLASSES = dataset.utils.metadata.get_one_hot_encoded_names(img_labels)
DOMAINS = dataset.utils.metadata.get_one_hot_encoded_names(img_domain)

CLASSES, DOMAINS

In [None]:
# Image dimensions reported for the training image directory.
# IMG_HEIGHT and IMG_WIDTH are used later to size the centre crop of the augmentation pipeline.
IMG_CHANNELS, IMG_HEIGHT, IMG_WIDTH = dataset.utils.metadata.get_image_dimensions(cfg.paths.IMG_DIR["train"])
IMG_CHANNELS, IMG_HEIGHT, IMG_WIDTH

### Domain information

In [None]:
# Convert both one-hot tables into a single named column each.
one_hot_to_names = dataset.utils.one_hot_encoding.convert_one_hot_df_to_names
diagnosis = one_hot_to_names(img_labels, "diagnosis")
diagnosis_method = one_hot_to_names(img_domain, "diagnosis_confirm_type")

# Place image name, diagnosis and diagnosis confirmation type side by side.
img_information = pd.concat([img_names, diagnosis, diagnosis_method], axis=1)

# Summarise the images per (domain, label) pair. The cells below rely on the
# result exposing an "image" column and a "size" column.
imgs_per_domain_label = aug.get_information_per_domain_label(
    img_information,
    column_names=["image", "diagnosis_confirm_type", "diagnosis"],
)

### Desired sizes

In [None]:
# Inspect the group sizes just below, at, and just above the median to help pick a target size interval.
imgs_per_domain_label["size"].quantile([0.45, 0.5, 0.55])

In [None]:
# Based on the quantiles inspected above, the target interval is [290, 430].
map_to_interval_fn = partial(aug.map_values_proportionally_to_interval, interval=(290, 430))

# Compute a desired size for every group, mapping the sizes into the interval
# separately within each diagnosis_confirm_type.
desired_count = (
    imgs_per_domain_label["size"]
    .groupby("diagnosis_confirm_type", group_keys=False)
    .apply(map_to_interval_fn)
)

# Drop any "desired size" column left over from a previous run of this cell
# before attaching the new one. Without this, re-running the cell would append
# a duplicate column and break the comparisons and row unpacking further down.
imgs_per_domain_label = pd.concat(
    [
        imgs_per_domain_label.drop(columns=["desired size"], errors="ignore"),
        desired_count.rename("desired size"),
    ],
    axis=1,
)
imgs_per_domain_label

## Data Augmentation

In [None]:
# We are gonna store new names here
# Holds, per group, the image names that make up the rebalanced dataset.
# It starts as a copy of the original "image" column and is overwritten group by group below.
data_augmented_information = imgs_per_domain_label[["image"]].copy()

In [None]:
# Split the groups by whether they already reach their desired size.
current_size = imgs_per_domain_label["size"]
target_size = imgs_per_domain_label["desired size"]

# Groups at or above the target get subsampled; groups below it get augmented.
to_reduce = imgs_per_domain_label.loc[current_size >= target_size]
to_augment = imgs_per_domain_label.loc[current_size < target_size]

In [None]:
# Create the output directory for the rebalanced dataset. An existing directory
# is refused on purpose so the folder only ever contains images written by this run.
try:
    cfg.paths.IMG_DIR["augmented_train"].mkdir(parents=True, exist_ok=False)
except FileExistsError as err:
    # Only the "already exists" case gets this message; other OS errors
    # (e.g. missing permissions) propagate unchanged with their own cause.
    raise FileExistsError(
        "Directory already exists.\n"
        "If you want to use it to store only desired images, "
        "you should delete the entire folder and then run this cell again."
    ) from err

### Reduce

In [None]:
# Partial function
copy_imgs_fn = partial(
    aug.copy_all_imgs,
    from_dir = cfg.paths.IMG_DIR["train"],
    to_dir = cfg.paths.IMG_DIR["augmented_train"],
    img_extension = "jpg"
)

In [None]:
# Subsample every group that already has at least its desired number of images.
# The row is read by column name, and the loop variables are named so that they
# do not overwrite the notebook-level `img_names` defined when the CSVs were read.
for (domain, label), group in to_reduce.iterrows():
    group_img_names = group["image"]
    desired_size = group["desired size"]

    # Draw `desired_size` distinct images from the group (no repeats).
    desired_img_names = np.random.choice(group_img_names, size=desired_size, replace=False)

    # Record the kept names and copy those images to the output directory.
    data_augmented_information.loc[(domain, label), "image"] = desired_img_names
    copy_imgs_fn(img_names=desired_img_names)

### Augment

In [None]:
# Random augmentation pipeline applied to each generated image.
# (`T` is torchvision.transforms, imported in the notebook's import cell.)
augmentation_fn = T.Compose(
    [
        # With probability 0.25, centre-crop to 90% of the height and width.
        T.RandomApply(
            [
                T.CenterCrop(
                    [int(0.90 * IMG_HEIGHT), int(0.90 * IMG_WIDTH)]
                )
            ],
            p=0.25,
        ),
        # Independent horizontal and vertical flips, each with probability 0.5.
        T.RandomHorizontalFlip(p=0.50),
        T.RandomVerticalFlip(p=0.50),
        # Rotation by an angle drawn from the range (0, 360) degrees.
        T.RandomRotation((0, 360)),
        # With probability 0.5, adjust sharpness by a factor of 2.
        T.RandomAdjustSharpness(sharpness_factor=2, p=0.50),
    ]
)

# Pre-bind directories, augmentation and extension so that callers only have
# to supply the image names and the suffix for the generated files.
augment_imgs_fn = partial(
    aug.augment_all_imgs,
    from_dir=cfg.paths.IMG_DIR["train"],
    augmentation=augmentation_fn,
    to_dir=cfg.paths.IMG_DIR["augmented_train"],
    img_extension="jpg",
)

In [None]:
# Grow every group that has fewer images than its desired size.
# The row is read by column name, and the loop variables are named so that they
# do not overwrite the notebook-level `img_names` defined when the CSVs were read.
for (domain, label), group in to_augment.iterrows():
    original_img_names = group["image"]
    size = group["size"]
    desired_size = group["desired size"]

    # Every original image is kept unchanged.
    copy_imgs_fn(img_names=original_img_names)
    new_img_names = list(original_img_names)

    # desired_size == num_iterations * size + num_extra. The originals account
    # for one multiple of `size`; the remainder is covered by augmenting a
    # random subset once, using the "_aug0" suffix.
    num_extra = desired_size % size
    extra_imgs_names = np.random.choice(original_img_names, size=num_extra, replace=False)
    augment_imgs_fn(img_names=extra_imgs_names, suffix="_aug0")
    new_img_names.extend(name + "_aug0" for name in extra_imgs_names)

    # The remaining (num_iterations - 1) multiples are full augmented passes
    # over the group, each with its own suffix "_aug1", "_aug2", ...
    num_iterations = desired_size // size
    for i in range(1, num_iterations):
        suffix = f"_aug{i}"
        augment_imgs_fn(img_names=original_img_names, suffix=suffix)
        new_img_names.extend(name + suffix for name in original_img_names)

    # NOTE(review): assumes augment_all_imgs names its outputs <name><suffix> — confirm in data_augmentation.
    data_augmented_information.loc[(domain, label), "image"] = np.array(new_img_names)

## Save CSV files

In [None]:
# Sanity check: each group should now hold exactly its desired number of images.
images_per_group = data_augmented_information["image"].apply(len)
images_per_group == imgs_per_domain_label["desired size"]

In [None]:
# Flatten to one row per image, order by image name, and turn the group index
# levels back into ordinary columns.
exploded = data_augmented_information.explode("image")
csvs = exploded.sort_values("image").reset_index()
csvs

In [None]:
# Labels are saved one-hot encoded.
# Indexing with CLASSES puts the columns in the same order as the original label table.
# dtype=int keeps the cells as 0/1 in the CSV: from pandas 2.0 get_dummies
# defaults to bool, which would be written as True/False.
one_hot = pd.get_dummies(csvs.set_index("image")["diagnosis"], dtype=int)[CLASSES].reset_index()
one_hot.to_csv(cfg.paths.LABELS_CSV["augmented_train"], index=False)

In [None]:
# The domain is saved by name (not one-hot encoded), next to the image name.
domain_columns = ["image", "diagnosis_confirm_type"]
csvs[domain_columns].to_csv(cfg.paths.DOMAIN_TRAIN_CSV["augmented_train"], index=False)