# Pre-Start Procedures

## Import Statements (Always Run)

In [1]:
# customary imports:
import os
import re
import datetime
import shutil
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

## *If Running In Colab* - Sets Current Working Dir to Your Google Drive Folder

In [None]:
# Placeholder: replace with the path of your project folder below /content/drive/.
your_drive_dir = 'YourGoogleDriveDirectoryPath'

# Switch the kernel's working directory to that folder and echo it back.
drive_project_path = '/content/drive/' + your_drive_dir
os.chdir(drive_project_path)
print("Current Working Directory is : " + os.getcwd())

# Imported only after the chdir above — presumably the module is resolved
# relative to the new working directory; TODO confirm.
import preprocess_crop

# Data Preprocessing - Do not run if data folders already established

## Function to Convert .mat folder into Another File Format and Reject Unwanted Data

In [2]:
# Import Statements
from utils.data_preprocessing_utils import convert_MAP

In [4]:
'''
This function rejects data under specified size requirement and 
data that is noisy / has little valuable info (uses mean and std).
This function can also perform contrast enhancement.
'''
# Conversion settings: source folder, destination folder name and the
# minimum shape handed to convert_MAP.
raw_directory = 'raw_data'
ENDPOINT_FOLDER_NAME = 'converted_data'
MIN_SHAPE = (128, 128)

print('Starting Data Convertion / Automated Rejection...\n')
# The value returned by convert_MAP is kept in converted_dir.
converted_dir = convert_MAP(
    raw_directory,
    ENDPOINT_FOLDER_NAME,
    MIN_SHAPE,
    file_format='.tif',
    search_keys=['map_532', 'map_all', 'DepMap_532', 'DepMap_all'],
    remove_noisy=True,
)
print('\nEnding Data Convertion / Automated Rejection')

Starting Data Convertion / Automated Rejection...

Min Size Not Met: Datapoint Rejected -> raw_data/190309_brain  6_Image3.mat
Min Size Not Met: Datapoint Rejected -> raw_data/190309_brain  6_Image4.mat
Min Size Not Met: Datapoint Rejected -> raw_data/20190508_thinnedskull_Epi_by 2    3_Image1.mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_1 (10).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_1 (12).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_1 (38).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_2 (10).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_2 (36).mat
Min Size Not Met: Datapoint Rejected -> raw_data/reslt_OR_3 (15).mat
Min Size Not Met: Datapoint Rejected -> raw_data/reslt_OR_3 (17).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_3 (23).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_3 (24).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_OR_3 (28).mat
Noisy Image: Datapoint Rejected -> raw_data/reslt_O

## Manual Data Rejection

After initial data cleaning, we manually went through the images  
and removed any that did not have clear vascular structures. 

In [16]:
# Import Statements:
from utils.data_preprocessing_utils import transfer_files_except

In [17]:
# Base names (no file extension) of the images rejected by hand,
# grouped by acquisition series; order matches the original list.
MANUAL_REMOVAL_LIST = [
    # reslt_OR_1
    'reslt_OR_1 (35)_index0', 'reslt_OR_1 (36)_index0', 'reslt_OR_1 (37)_index0',
    # reslt_OR_2
    'reslt_OR_2 (8)_index0', 'reslt_OR_2 (25)_index0', 'reslt_OR_2 (29)_index0',
    'reslt_OR_2 (30)_index0', 'reslt_OR_2 (31)_index0', 'reslt_OR_2 (34)_index0',
    'reslt_OR_2 (35)_index0', 'reslt_OR_2 (38)_index0', 'reslt_OR_2 (40)_index0',
    # reslt_OR_3
    'reslt_OR_3 (25)_index0', 'reslt_OR_3 (26)_index0', 'reslt_OR_3 (31)_index0',
    'reslt_OR_3 (32)_index0', 'reslt_OR_3 (33)_index0',
    # reslt_OR_4
    'reslt_OR_4 (21)_index0', 'reslt_OR_4 (26)_index0', 'reslt_OR_4 (27)_index0',
    'reslt_OR_4 (28)_index0',
    # reslt_OR_5
    'reslt_OR_5 (19)_index0', 'reslt_OR_5 (20)_index0', 'reslt_OR_5 (23)_index0',
    'reslt_OR_5 (24)_index0', 'reslt_OR_5 (25)_index0',
    # reslt_OR_6
    'reslt_OR_6 (14)_index0', 'reslt_OR_6 (15)_index0', 'reslt_OR_6 (18)_index0',
    'reslt_OR_6 (19)_index0', 'reslt_OR_6 (20)_index0',
    # reslt_OR_7
    'reslt_OR_7 (11)_index0', 'reslt_OR_7 (14)_index0', 'reslt_OR_7 (15)_index0',
    'reslt_OR_7 (17)_index0', 'reslt_OR_7 (18)_index0', 'reslt_OR_7 (19)_index0',
    # reslt_OR_8
    'reslt_OR_8 (13)_index0', 'reslt_OR_8 (14)_index0', 'reslt_OR_8 (16)_index0',
    'reslt_OR_8 (17)_index0', 'reslt_OR_8 (18)_index0',
]

In [18]:
# Copy everything from 'converted_data' to 'refined_data' via
# transfer_files_except, passing the hand-rejected file names as exception_list.
#
# NOTE(review): convert_MAP is invoked with file_format = '.tif' earlier in this
# notebook, while '.png' is appended here — confirm that the names in the
# exception list really match the files present in 'converted_data'.
file_format = '.png'

# Append the extension only where it is still missing. The original
# unconditional append made this cell non-idempotent: running it a second time
# produced names such as 'x.png.png'.
MANUAL_REMOVAL_LIST = [
    name if name.endswith(file_format) else name + file_format
    for name in MANUAL_REMOVAL_LIST
]

input_dir = 'converted_data'
output_dir = 'refined_data'
print('Starting Manual Rejection...')
transfer_files_except(input_dir, output_dir, exception_list = MANUAL_REMOVAL_LIST)
print('Ending Manual Rejection')

Starting Manual Rejection...
Ending Manual Rejection


## Function to Process Data by Denoising and Increasing Contrast

In [19]:
# Import Statements:
from utils.data_preprocessing_utils import data_clean_func
from utils.data_preprocessing_utils import data_cleaning

In [20]:
# Settings forwarded to data_cleaning below.
input_data_dir = 'refined_data'
output_dir = 'cleaned_data'
threshold = (10.0, 100.0)  # Percentiles
hist_eq = True
delete_previous = True

print('Starting Data Cleaning...')
data_cleaning(
    input_dir=input_data_dir,
    output_dir_name=output_dir,
    output_file_format='.jpg',
    threshold=threshold,
    hist_eq=hist_eq,
    delete_previous=delete_previous,
)
print('Ending Data Cleaning')

Starting Data Cleaning...
Ending Data Cleaning


## Function to Separate Folder into Subfolders of /training /validation (and /testing - optional)

In [21]:
# Import Statements:
from utils.data_preprocessing_utils import data_seperation

In [22]:
# Settings for the split; every value is passed positionally to data_seperation.
input_dir = 'cleaned_data'
output_dir = 'data_clean'
dataset_percentages = (90, 10)  # two percentages; two directories are unpacked below
random_state = 7
delete_previous = True
# NOTE(review): the cleaning cell passes output_file_format = '.jpg' while
# '.jpeg' is used here — confirm which extension data_seperation expects.
file_format = '.jpeg'

print('Starting Data Seperation...')
train_dir, val_dir = data_seperation(
    input_dir,
    output_dir,
    dataset_percentages,
    delete_previous,
    file_format,
    random_state,
)
print('Ending Data Seperation')

Starting Data Seperation...
Ending Data Seperation


### Code Block to Show How Much of the Data Has Been Placed in the Various Subfolders (train/val/test)

In [None]:
# Report how many entries each split directory contains.
print('Length of training directory = ' + str(len(os.listdir(train_dir))))
print('Length of validation directory = ' + str(len(os.listdir(val_dir))))

# test_dir is not assigned by the two-way split in this notebook, so the lookup
# is guarded. The original `try:` had no matching `except`, which is a
# SyntaxError and prevented the whole cell from running.
try:
    print('Length of test directory = ' + str(len(os.listdir(test_dir))))
except NameError:
    print('Length of test directory = n/a (test_dir is not defined)')

# Data Loading

## Padding Training Data and Adding Downsampled Image as Channel

In [6]:
# Import Statements:
from utils.standardize_dir_utils import pad_img_and_add_down_channel
from utils.standardize_dir_utils import standardize_dir

In [4]:
# Shared settings for the standardize_dir calls in the following cells.
main_dir = 'data'                      # root folder; '/train/input' etc. are appended to it
file_format = '.tif'
STANDARD_IMAGE_SHAPE = (128, 128, 1)
downsample_axis = 'both'
downsample_ratio = [1, 5]

In [12]:
# Run standardize_dir over the training inputs and print the value it returns.
input_dir = main_dir + '/train/input'
total_train_image_pool = standardize_dir(
    input_dir,
    downsample_axis,
    downsample_ratio,
    STANDARD_IMAGE_SHAPE,
    file_format,
)
print('total_train_image_pool = ' + str(total_train_image_pool))

total_train_image_pool = 304


In [13]:
# Run standardize_dir over the validation inputs and print the value it returns.
input_dir = main_dir + '/val/input'
total_val_image_pool = standardize_dir(
    input_dir,
    downsample_axis,
    downsample_ratio,
    STANDARD_IMAGE_SHAPE,
    file_format,
)
print('total_val_image_pool = ' + str(total_val_image_pool))

total_val_image_pool = 39


In [14]:
# Run standardize_dir over the test inputs and print the value it returns.
input_dir = main_dir + '/test/input'
total_test_image_pool = standardize_dir(
    input_dir,
    downsample_axis,
    downsample_ratio,
    STANDARD_IMAGE_SHAPE,
    file_format,
)
print('total_test_image_pool = ' + str(total_test_image_pool))

total_test_image_pool = 38


# Standardizing Image Size Using Uniform Patches:

In [3]:
from utils.patch_utils import save_patches

In [18]:
# NOTE(review): input_dir points at the test inputs while output_dir is named
# 'valid', and '.jpeg' differs from the '.tif' used by the standardize cells —
# confirm both are intended.
save_patches(
    (128, 128),
    input_dir='./data/test/input',
    output_dir='./data_patches/valid',
    file_format='.jpeg',
    delete_previous=True,
)