In [None]:
import shutil
import os
import pandas as pd
from tqdm import tqdm

In [None]:
# Relative path (resolved against the notebook's working directory) of the label CSV.
datapath = "../../datasets/tb-wellgen-smear/v1/tb-labels.csv"

In [None]:
# Load the label table. Later cells read its tb_positive, file_path and image columns.
data = pd.read_csv(datapath)
data

In [None]:
# Rows labelled TB-positive, and a reproducible (seeded) 2500-row training draw from them.
positive = data.loc[data["tb_positive"].eq(1)]
train_positive = positive.sample(random_state=42, n=2500)

In [None]:
# Hold out every positive row that was not drawn for training.
# Dropping by index label states the intent ("positive minus train") directly,
# keeps the rows in their original table order, and never asks for a label
# that is absent from `positive` (which a set symmetric-difference could).
# NOTE(review): .dropna() is applied to this eval subset but not to
# train_positive -- confirm that asymmetry is intended.
eval_positive = positive.drop(index=train_positive.index).dropna()
eval_positive

In [None]:
# Tag each positive subset with the split it belongs to.
train_positive = train_positive.assign(split="train")
eval_positive = eval_positive.assign(split="eval")

In [None]:
# Display the held-out positive rows.
eval_positive

In [None]:
# Display the positive rows sampled for training.
train_positive

In [None]:
# Plain Python lists of three columns of the full label table.
labels = data["tb_positive"].tolist()
paths = data["file_path"].tolist()
images = data["image"].tolist()

In [None]:
# Rows labelled TB-negative; the eval negatives are a seeded sample with as
# many rows as eval_positive.
negative = data.loc[data["tb_positive"].eq(0)]
eval_negative = negative.sample(random_state=42, n=len(eval_positive))
eval_negative

In [None]:
# Every negative row that was not sampled for evaluation becomes training data.
# Dropping by index label states the intent ("negative minus eval") directly,
# keeps the rows in their original table order, and never asks for a label
# that is absent from `negative` (which a set symmetric-difference could).
# NOTE(review): .dropna() is applied to this train subset but not to
# eval_negative -- confirm that asymmetry is intended.
train_negative = negative.drop(index=eval_negative.index).dropna()
train_negative

In [None]:
# Tag each negative subset with the split it belongs to.
train_negative = train_negative.assign(split="train")
eval_negative = eval_negative.assign(split="eval")

In [None]:
# Stack the four subsets into one table, then shuffle all rows with a fixed seed.
all_data = pd.concat(
    [train_negative, eval_negative, train_positive, eval_positive]
).sample(frac=1, random_state=42)
all_data

In [None]:
def create_images_to_dir(dataset_split, data_paths, data_labels, images_names, base_dir=None):
    """Copy images into a ``<base_dir>/<dataset_split>/<label>/<image_name>`` tree.

    Args:
        dataset_split: Name of the split sub-directory (e.g. ``'train'``).
        data_paths: Sequence of source file paths to copy.
        data_labels: Sequence of labels, one per path; ``str(label)`` is used
            as the class sub-directory name.
        images_names: Sequence of destination file names, one per path.
        base_dir: Root of the output tree. Defaults to
            ``/home/ngsci/project/tb``.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """
    if base_dir is None:
        base_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb')

    if not (len(data_paths) == len(data_labels) == len(images_names)):
        raise ValueError(
            'data_paths, data_labels and images_names must have the same length '
            '(got {}, {} and {})'.format(len(data_paths), len(data_labels), len(images_names))
        )

    directory = os.path.join(base_dir, dataset_split)
    created_directories = set()
    records = zip(data_paths, data_labels, images_names)
    for data_path, data_label, image_name in tqdm(records, total=len(data_paths),
                                                  desc="Data Creation Progress"):
        label_directory = os.path.join(directory, str(data_label))
        # shutil.copy does not create missing parent directories, so make sure
        # the class folder exists before the first copy into it.
        if label_directory not in created_directories:
            os.makedirs(label_directory, exist_ok=True)
            created_directories.add(label_directory)
        shutil.copy(data_path, os.path.join(label_directory, image_name))

In [None]:
# !ls ../tb/train/1/aug*

In [None]:
# Split the shuffled table back into its train and eval portions.
train_data = all_data.loc[all_data["split"].eq("train")]
eval_data = all_data.loc[all_data["split"].eq("eval")]

# Source path, label and image name of every training row, as plain lists.
train_data_paths = train_data["file_path"].tolist()
train_data_labels = train_data["tb_positive"].tolist()
train_images_names = train_data["image"].tolist()

# The same three columns for the eval rows.
val_data_paths = eval_data["file_path"].tolist()
val_data_labels = eval_data["tb_positive"].tolist()
val_images_names = eval_data["image"].tolist()

In [None]:
# Copy the training images, then the eval images (stored under the 'val' folder).
create_images_to_dir(
    dataset_split='train',
    data_paths=train_data_paths,
    data_labels=train_data_labels,
    images_names=train_images_names,
)
create_images_to_dir(
    dataset_split='val',
    data_paths=val_data_paths,
    data_labels=val_data_labels,
    images_names=val_images_names,
)

In [None]:
import glob

# Root folders of the two splits.
train_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'train')
eval_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'val')

# Train: only files whose name starts with "tb", in any class folder.
# Eval: every file in any class folder.
train_slides_fp = os.path.join(train_data_dir, '*', 'tb*')
eval_slides_fp = os.path.join(eval_data_dir, '*', '*')

train_slides_list = glob.glob(train_slides_fp)
eval_slides_list = glob.glob(eval_slides_fp)

print('Train Images :{}'.format(len(train_slides_list)))
print('Eval Images :{}'.format(len(eval_slides_list)))

In [None]:
# Collect the "tb*" files in the positive training folder. The augmentation
# function defined further down indexes into train_slides_list.
train_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'train', '1')
train_slides_fp = os.path.join(train_data_dir, 'tb*')
train_slides_list = glob.glob(train_slides_fp)
print('Train Positive Images Before Augmentation :{}'.format(len(train_slides_list)))

In [None]:
# import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import glob
import numpy as np

# Random augmentation settings shared by build_augmented_images().
trainAug = ImageDataGenerator(
    # geometric jitter
    rotation_range=90,
    shear_range=0.15,
    zoom_range=0.15,
    # translations, as a fraction of the image size
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    # pixels exposed by a transform are filled from the nearest valid pixel
    fill_mode="nearest",
)

def build_augmented_images(start_index=2388, images_per_source=30):
    """Write augmented copies of the positive training images to disk.

    For every path in the module-level ``train_slides_list`` from
    ``start_index`` onwards, the image is loaded, run through the module-level
    ``trainAug`` generator, and ``images_per_source`` augmented JPEGs are saved
    into the ``train/1`` folder with the file-name prefix ``aug_image_<index>``.
    When the loop finishes, the number of ``aug_image_*`` files in that folder
    is printed once.

    Args:
        start_index: Position in ``train_slides_list`` to start from. The
            default (2388) is the value that used to be hard-coded here.
            NOTE(review): it looks like a resume point from an interrupted
            run -- pass 0 to augment every listed image.
        images_per_source: Number of augmented files to write per source image.
    """
    directory = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'train', '1')

    for image_index in tqdm(range(start_index, len(train_slides_list)),
                            desc='Generating Augmentation Images'):
        # flow() expects a batch, so add a leading axis to the single image.
        image = img_to_array(load_img(train_slides_list[image_index]))
        image = np.expand_dims(image, 0)

        imageGen = trainAug.flow(image, save_to_dir=directory,
                                 save_prefix='aug_image_{}'.format(image_index),
                                 save_format='jpg')

        # The iterator is endless and writes one file every time it is
        # advanced. zip() consults range() first, so it stops after exactly
        # `images_per_source` batches; a counter checked inside the loop body
        # would let one extra batch be generated (and saved) before breaking.
        for _ in zip(range(images_per_source), imageGen):
            pass

    # Count once, after the loop, using the prefix the files were saved with
    # (a pattern such as 'aug_4*' can never match 'aug_image_<index>...').
    new_train_slides_fp = os.path.join(directory, 'aug_image_*')
    new_train_slides_list = glob.glob(new_train_slides_fp)
    print('Round 4 Augmentated Images :{}'.format(len(new_train_slides_list)))

# Run the augmentation; as a side effect, files are written into the train/1 folder.
build_augmented_images()

In [None]:
# Count every file now present in the positive training folder.
train_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'train', '1')
train_slides_fp = os.path.join(train_data_dir, '*')
train_slides_list = glob.glob(train_slides_fp)
print('Train Positive Images After Augmentation :{}'.format(len(train_slides_list)))

In [None]:
# Count every file present in the negative training folder.
train_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'train', '0')
train_slides_fp = os.path.join(train_data_dir, '*')
train_slides_list = glob.glob(train_slides_fp)
print('Train Negative Images After Augmentation :{}'.format(len(train_slides_list)))

In [None]:
# Data augmentation ended up not working well: the best LB (leaderboard) result was obtained with the non-augmented dataset.

In [None]:
import os
import shutil
import glob

# Collect every "aug_*" file in the positive training folder and report how many there are.
new_train_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'train', '1')
new_train_slides_fp = os.path.join(new_train_data_dir, 'aug_*')
new_train_slides_list = glob.glob(new_train_slides_fp)
print('Augmentated Images :{}'.format(len(new_train_slides_list)))

In [None]:
# Permanently delete each path collected in new_train_slides_list
# (the 'aug_*' matches globbed in the preceding cell).
for file_path in new_train_slides_list:
    os.remove(file_path)

In [None]:
# Re-run the "aug_*" glob on the positive training folder and print the current count.
new_train_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'tb', 'train', '1')
new_train_slides_fp = os.path.join(new_train_data_dir, 'aug_*')
new_train_slides_list = glob.glob(new_train_slides_fp)
print('Augmentated Images :{}'.format(len(new_train_slides_list)))