# Dataset Preprocessing

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split

from torchvision import transforms
from torch.utils.data import DataLoader

from src.data.dataset import SkinDataset
from src.utils.config import Config
from src.utils.utils import merge_metadata_label, get_dataset_mean_std

In [None]:
# Every filesystem location used by this notebook is resolved through the
# project configuration, looked up by its configuration key.

# Raw inputs: the image folder plus the ground-truth and metadata CSV files.
IMAGES_PATH, GROUND_TRUTH_PATH, METADATA_PATH = map(
    Config.get_path, ('images', 'ground_truth_csv', 'metadata_csv')
)

# Intermediate outputs: the merged dataset and its cleaned version.
MERGED_DATASET_PATH, CLEAN_DATASET_PATH = map(
    Config.get_path, ('merged_dataset_csv', 'clean_dataset_csv')
)

# Final outputs: the training and test split CSV files.
TRAINING_PATH, TEST_PATH = map(Config.get_path, ('training_csv', 'test_csv'))

Each metadata sample is given a ``label`` field, taken from the corresponding row of the ground truth CSV. <br/>
The ``lesion_id`` attribute is dropped from the metadata: only the ``image``, ``age_approx``, ``anatom_site_general`` and ``sex`` columns are loaded. <br/>

In [None]:
# Metadata attributes kept for the rest of the pipeline; any other column of
# the metadata CSV is skipped at load time.
METADATA_COLUMNS = ['image', 'age_approx', 'anatom_site_general', 'sex']

metadata = read_csv(METADATA_PATH, usecols=METADATA_COLUMNS)
diagnoses = read_csv(GROUND_TRUTH_PATH)

# Combine metadata and ground truth into a single table, then persist it
# (without the index column) so the merged data can be inspected on its own.
dataset = merge_metadata_label(metadata, diagnoses)
dataset.to_csv(
    MERGED_DATASET_PATH,
    index=False,
    encoding='utf-8',
)
print('Size of the Initial Dataset:', len(dataset))

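Then, I remove all rows that have a missing value in ``age_approx``, ``anatom_site_general`` or ``sex``.<br/>
Samples belonging to the classes we are not interested in (``BKL``, ``AK`` and ``VASC``) are also removed.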

In [None]:
# Clean the merged dataset: keep only samples with complete metadata that
# belong to a class of interest, then save the result.

# Metadata columns that must be present for a sample to be usable.
REQUIRED_COLUMNS = ['age_approx', 'anatom_site_general', 'sex']
# Diagnosis classes that are left out of the dataset.
EXCLUDED_LABELS = ['BKL', 'AK', 'VASC']

dataset = dataset.dropna(subset=REQUIRED_COLUMNS)

# Filter with a single boolean mask instead of dropping rows by index label
# once per class: it is one pass over the data, and it removes exactly the
# matching rows even if the index happens to contain duplicate labels.
dataset = dataset[~dataset['label'].isin(EXCLUDED_LABELS)].copy()

# Name the index so that it is written to the CSV under the 'index' header.
dataset.index.name = 'index'

dataset.to_csv(CLEAN_DATASET_PATH, encoding='utf-8')
print('Size of the Clean Dataset:', len(dataset))

In order to train the classifier, it is necessary to normalize the images.<br/>
After normalization they must have zero mean and unit standard deviation, so the mean and standard deviation of the dataset images are computed here.

In [None]:
# Compute the statistics of the clean dataset images. The only transform
# applied is the conversion to tensors.
to_tensor = transforms.ToTensor()
dset = SkinDataset(
    IMAGES_PATH,
    CLEAN_DATASET_PATH,
    to_tensor,
)

# The loader uses the default batching; pinned memory is requested.
dataset_loader = DataLoader(dset, pin_memory=True)

mean, std_dev = get_dataset_mean_std(dataset_loader)

print('Dataset images mean:', mean)
print('Dataset images standard deviation:', std_dev)

Now I split the dataset into a **Training Set** [85%] and a **Test Set** [15%]. <br/>
The split is random: the samples are shuffled and no seed is fixed, so the two sets change on every run.

In [None]:
# Shuffle the clean dataset and split it: 85% of the samples go to the
# training set, the remainder to the test set. No seed is fixed, so the
# partition differs between runs.
tr, te = train_test_split(
    dataset,
    train_size=0.85,
    shuffle=True,
)

# Persist each partition (index column included) to its own CSV file.
for partition, destination in ((tr, TRAINING_PATH), (te, TEST_PATH)):
    partition.to_csv(destination, encoding='utf-8')

print('Training Set Size:', len(tr))
print('Test Set Size:', len(te))