# Creating training set

This notebook generates a series of 32x32x32 volumes of both image and mask data of vesicles to create a training set for machine learning.

In [None]:
import os, re, glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import skimage.io as io

from skimage.morphology import label as label
from skimage.measure import regionprops as regprop
from skimage.morphology import disk, binary_dilation

import mrcfile

In [None]:
# #define location of data
# server_path = '/Volumes/synaptosome/pyto/tomo/'
# server_path = '/mnt/data/amin/Data/tomo/'
# #define location where to save the training data 
# folder_to_save = '/Users/gw18g940/Desktop/Test_data/Zuber/multi_set_training/'
# folder_to_save = '/mnt/data/amin/Data/train_dataset_nonad/'

In [None]:
# Input root: the cells below glob this directory for one entry per tomogram.
# server_path = '/mnt/data/amin/Handpicked/'
server_path = '/mnt/data/Amin/tomo/'
# Output directory that receives the image_<n>.npy / mask_<n>.npy pairs of the
# 3D training set. The trailing slash matters: file names are appended by
# plain string concatenation.
folder_to_save = '/mnt/data/Amin/Data/train_dataset_32_synaptasome_1024/'

In [None]:
# Find all entries below server_path (one entry per tomogram is expected).
# glob.glob returns its matches in arbitrary, file-system dependent order, so
# the list is sorted to make the numbering of the saved patches reproducible
# from run to run.
# folders = glob.glob(server_path+'*ctrl*')
folders = sorted(glob.glob(server_path+'*'))
print(len(folders))

In [None]:
# Display the list of discovered entries as the cell output.
folders

In [None]:
# For every folder that provides a label volume (labels-16bit.mrc), load the
# tomogram (*.rec.nad) together with the labels and cut both into cubes of
# stride x stride x stride voxels. A cube is only written to disk when it
# contains more than `target_count` foreground voxels; mostly empty regions
# are not interesting for training.
ind = 0                  # running index used in the output file names
target_count = 1024*1    # minimum number of foreground voxels per saved cube
stride = 32              # cube edge length and step between cubes (no overlap)

# np.save does not create missing directories.
os.makedirs(folder_to_save, exist_ok=True)

for f in folders:

    print(f)
    label_path = f+'/labels-16bit.mrc'
    if not os.path.exists(label_path):
        continue
    print(ind)

    # Sorted so that the same file is picked on every run if several match.
    image_paths = sorted(glob.glob(f+'/*.rec.nad'))
    if not image_paths:
        # Labels without a tomogram cannot be used; skip the folder instead of
        # failing with an IndexError on the empty glob result.
        print('No *.rec.nad file found in '+f+', skipping')
        continue

    # The context managers close both MRC files again once the data is read.
    with mrcfile.open(image_paths[0]) as imagefile, \
            mrcfile.open(label_path) as maskfile:
        image = imagefile.data
        # Binarise the labels: values >= 10 count as foreground
        # (the vesicles, according to the notebook description).
        mask = maskfile.data >= 10

    # fig, ax = plt.subplots(figsize=(5,5))
    # plt.imshow(image[100,:,:],cmap = 'gray')
    # plt.imshow(mask[100,:,:], cmap = 'Reds', alpha = 0.1)
    # plt.show()

    # Walk over all non-overlapping cubes. The "+1" keeps the last full cube
    # when it ends exactly at the border of the volume; an axis shorter than
    # `stride` yields an empty range and therefore no cubes.
    for z in range(0, image.shape[0]-stride+1, stride):
        for x in range(0, image.shape[1]-stride+1, stride):
            for y in range(0, image.shape[2]-stride+1, stride):
                mask_cube = mask[z:z+stride, x:x+stride, y:y+stride]
                if mask_cube.sum() > target_count:
                    np.save(folder_to_save+'image_'+str(ind)+'.npy',
                            image[z:z+stride, x:x+stride, y:y+stride].astype(np.float32))

                    np.save(folder_to_save+'mask_'+str(ind)+'.npy',
                            mask_cube.astype(np.float32))
                    ind += 1

# Index of the last saved pair (-1 when nothing was saved).
print(ind-1)

In [None]:
# Visual sanity check: reload one saved pair and overlay the mask on the image.
index = 2
z_slice = 25  # plane of the cube that is displayed

image = np.load(f"{folder_to_save}image_{index}.npy")
mask = np.load(f"{folder_to_save}mask_{index}.npy")

plt.imshow(image[z_slice], cmap='gray')
plt.imshow(mask[z_slice], cmap='Reds', alpha=0.2)
plt.show()


In [None]:
# Sum the loaded mask along its first axis and show the resulting projection.
mask_projection = mask.sum(axis=0)
plt.imshow(mask_projection, cmap='Reds')
plt.show()


# Creating training set for 2D networks

In [None]:
# Input root: the cells below glob this directory for one entry per tomogram.
# server_path = '/mnt/data/amin/Handpicked/'
server_path = '/mnt/data/Amin/tomo/'
# Output directory that receives the image_<n>.npy / mask_<n>.npy pairs of the
# 2D training set. The trailing slash matters: file names are appended by
# plain string concatenation.
folder_to_save = '/mnt/data/Amin/Data_latest/train_dataset_1axes_2d_64_synaptasome_128/'
# folder_to_save = '/media/amin/mtwo/train_dataset_2d_128_synaptasome_512/'

In [None]:
# Find all entries below server_path (one entry per tomogram is expected).
# glob.glob returns its matches in arbitrary, file-system dependent order.
# The extraction cell treats the LAST folders of this list as validation data,
# so the list is sorted to keep the train/validation split reproducible.
# folders = glob.glob(server_path+'*ctrl*')
folders = sorted(glob.glob(server_path+'*'))
folders

In [None]:
# For every folder that provides a label volume (labels-16bit.mrc), load the
# tomogram (*.rec.nad) together with the labels and cut 2D patches of
# stride x stride pixels out of single slices. A patch is only written to disk
# when it contains more than `target_count` foreground pixels.
ind = 0                   # running index used in the output file names
num_validation_data = 2   # the last N folders are counted as validation data
target_count = 128        # minimum number of foreground pixels per saved patch
stride = 64               # patch edge length and step between patches

# False: only slices along the first axis are extracted, for every folder
# (this is what the former `if True or ...` guard did; presumably the "1axes"
# in the output folder name refers to it).
# True: training folders additionally contribute slices along the two other
# axes; validation folders are still restricted to the first axis.
extract_other_axes = False

# np.save does not create missing directories.
os.makedirs(folder_to_save, exist_ok=True)


def _save_patch(image_patch, mask_patch, index):
    """Write one image/mask pair as float32 .npy files named after `index`."""
    np.save(folder_to_save+'image_'+str(index)+'.npy',
            image_patch.astype(np.float32))
    np.save(folder_to_save+'mask_'+str(index)+'.npy',
            mask_patch.astype(np.float32))


# Folders from this position onwards are validation data.
first_validation_index = max(len(folders) - num_validation_data, 0)
train_dataset_size = None

for i, f in enumerate(folders):
    if i == first_validation_index:
        # Everything saved so far came from training folders. Recording the
        # size here (instead of inside the per-folder branch) also works when
        # the last training folder has no label file.
        print("Train Datasize: ", ind)
        train_dataset_size = ind

    print(f)
    label_path = f+'/labels-16bit.mrc'
    if not os.path.exists(label_path):
        continue
    print(ind)

    # Sorted so that the same file is picked on every run if several match.
    image_paths = sorted(glob.glob(f+'/*.rec.nad'))
    if not image_paths:
        # Labels without a tomogram cannot be used; skip the folder instead of
        # failing with an IndexError on the empty glob result.
        print('No *.rec.nad file found in '+f+', skipping')
        continue
    print(image_paths[0])

    # The context managers close both MRC files again once the data is read.
    with mrcfile.open(image_paths[0]) as imagefile, \
            mrcfile.open(label_path) as maskfile:
        image = imagefile.data
        # Binarise the labels: values >= 10 count as foreground.
        mask = maskfile.data >= 10

    # fig, ax = plt.subplots(figsize=(5,5))
    # plt.imshow(image[100,:,:],cmap = 'gray')
    # plt.imshow(mask[100,:,:], cmap = 'Reds', alpha = 0.1)
    # plt.show()

    # In all window loops the "+1" keeps the last full window when it ends
    # exactly at the border; an axis shorter than `stride` yields no windows.

    # Patches from slices along axis 0 (one slice per z).
    for z in range(image.shape[0]):
        for x in range(0, image.shape[1]-stride+1, stride):
            for y in range(0, image.shape[2]-stride+1, stride):
                mask_patch = mask[z, x:x+stride, y:y+stride]
                if mask_patch.sum() > target_count:
                    _save_patch(image[z, x:x+stride, y:y+stride], mask_patch, ind)
                    ind += 1

    if not extract_other_axes or i >= first_validation_index:
        print(i, f)
        continue

    # Patches from slices along axis 2 (one slice per y).
    for y in range(image.shape[2]):
        for x in range(0, image.shape[1]-stride+1, stride):
            for z in range(0, image.shape[0]-stride+1, stride):
                mask_patch = mask[z:z+stride, x:x+stride, y]
                if mask_patch.sum() > target_count:
                    _save_patch(image[z:z+stride, x:x+stride, y], mask_patch, ind)
                    ind += 1

    # Patches from slices along axis 1 (one slice per x).
    for x in range(image.shape[1]):
        for z in range(0, image.shape[0]-stride+1, stride):
            for y in range(0, image.shape[2]-stride+1, stride):
                mask_patch = mask[z:z+stride, x, y:y+stride]
                if mask_patch.sum() > target_count:
                    _save_patch(image[z:z+stride, x, y:y+stride], mask_patch, ind)
                    ind += 1

if train_dataset_size is None:
    # The loop never reached a validation folder (num_validation_data == 0
    # or no folders at all): everything that was saved is training data.
    print("Train Datasize: ", ind)
    train_dataset_size = ind
print("Val Datasize: ", ind - train_dataset_size)