In [None]:
%matplotlib inline
%autosave 60
import numpy as np
import pandas as pd
from glob import glob
import os
from sklearn.model_selection import train_test_split
import shutil
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator as kerasImageDataGenerator

In [None]:
# DEFINE FUNCTIONAL PARAMETERS
train_again = 0            # NOTE(review): not read in the cells visible here; presumably 1 retrains the model - confirm
split_and_organize = 0     # 1 to organize and split all images by label and to split train/validation, 0 if already done

# DEFINE PATHS
# Raw strings keep the Windows backslashes literal. Without the r-prefix,
# '\i' and '\*' are invalid escape sequences (a SyntaxWarning on recent
# Python versions); the resulting value is unchanged.
IMAGES_PATH = r'dataset\input\*.tif'        # glob pattern for the input images
LABELS_PATH = r'dataset\train_labels.csv'   # CSV file read with pandas and merged on 'id'
TRAINING_PATH = 'training'                  # root folder for the training copies (one subfolder per label)
VALIDATION_PATH = 'validation'              # root folder for the validation copies (one subfolder per label)

# DEFINE HYPER-PARAMETERS
TRAINING_BATCH_SIZE = 32
LR_INITIAL = 0.0001
LR_DECAY = 0.00001

In [None]:
# Add path Column                       data_frame = [path]
data_frame = pd.DataFrame({'path': glob(IMAGES_PATH)})
display(data_frame.head())

# Add ID Column (from path)             data_frame = [path, ID]
def get_id_from_path(image_path):
    """Return the image ID: the file name of `image_path` without its extension."""
    # basename/splitext work at any directory depth and with the platform's
    # path separators, unlike splitting on '\\' and taking a fixed component.
    return os.path.splitext(os.path.basename(image_path))[0]

data_frame['id'] = data_frame['path'].map(get_id_from_path)
# A bare expression in the middle of a cell shows nothing; display() it instead.
display(data_frame.head())

# Add Label column (from label file)    data_frame = [path, ID, label]
labels = pd.read_csv(LABELS_PATH)
# merge() defaults to an inner join: images whose id has no row in the label
# file are dropped from data_frame.
data_frame = data_frame.merge(labels, on='id')
data_frame.head()

In [None]:
## SPLIT DATASET INTO NEG/POS AND TRAIN/VAL
# NOTE(review): the `split_and_organize` flag from the config cell is not
# consulted here - confirm whether this cell should be skipped when it is 0.
SAMPLES_PER_CLASS = 85000   # target number of images drawn from each class
SPLIT_SEED = 42             # fixed seed: re-running reproduces the same sample and the same train/validation split,
                            # so an image cannot end up copied into both directory trees

negatives = data_frame[data_frame['label'] == 0]
positives = data_frame[data_frame['label'] == 1]
display('Num of positive samples: ' + str(len(positives)))
display('Num of negative samples: ' + str(len(negatives)))

# Balance the classes. Clamp to the smaller class so sample() cannot raise
# when fewer than SAMPLES_PER_CLASS images of one class are available.
n_per_class = min(SAMPLES_PER_CLASS, len(negatives), len(positives))
negatives = negatives.sample(n_per_class, random_state=SPLIT_SEED)
positives = positives.sample(n_per_class, random_state=SPLIT_SEED)
# reset_index() deliberately keeps the old index as a column: when this cell is
# re-run, data_frame is indexed by 'id' and this is what restores the 'id' column.
data_frame = pd.concat([negatives, positives]).reset_index()
data_frame = data_frame[['path', 'id', 'label']]
display(data_frame.head())

## ORGANIZE SAMPLES INTO DIRECTORIES
# 1. Create subdirectories (<split>/<label>/)
for folder in [TRAINING_PATH, VALIDATION_PATH]:
    for subfolder in ['0', '1']:
        os.makedirs(os.path.join(folder, subfolder), exist_ok=True)

# 2. Copy appropriate samples into new organized directories
training, validation = train_test_split(
    data_frame,
    train_size=0.9,
    stratify=data_frame['label'],
    random_state=SPLIT_SEED,
)
# Leave data_frame indexed by 'id', as this cell always has.
data_frame = data_frame.set_index('id')

total = len(training) + len(validation)
n = 1
for images, path in [(training, TRAINING_PATH), (validation, VALIDATION_PATH)]:
    # Each split frame already carries the source path and label of every
    # image, so no per-row lookup in data_frame is needed.
    for source, image, label in zip(images['path'], images['id'], images['label']):
        destination = os.path.join(path, str(label), image + '.tif')
        if n % 2000 == 0:
            print(str(n) + ' / ' + str(total), end="\r")
        n += 1
        # Skip files already copied by an earlier run.
        if not os.path.exists(destination):
            shutil.copyfile(source, destination)