# Preprocessing Notebook

## Preprocessing for single-channel data

In [None]:
from glob import glob
import numpy as np
from tifffile import imread
from PIL import Image
import PIL.ImageOps  
import os
from pathlib import Path
import cv2
import csv
import shutil
from shutil import copy,copytree
import fnmatch
import matplotlib.pyplot as plt
Image.MAX_IMAGE_PIXELS = None

In [None]:
# Placeholder settings -- change these to match the data being processed.
protein_name = 'Dystrophin'  # used to select files ending in "<protein_name>.ome.tiff"
subject_type = 'controls'    # sub-folder of the data set that is processed
# Tile size (in pixels) used when the single-channel images are split.
split_image_height = split_image_width = 512

In [None]:
# Prefix every file name with the name of the folder that contains it (the
# subject ID).  Files whose name already starts with "<folder>_" are skipped,
# so running this cell a second time no longer produces "ID_ID_name" files.
# NOTE: a file whose original name happens to start with "<folder>_" is
# therefore treated as already renamed.
path = './Dataset'
for root, dirs, files in os.walk(path):
    prefix = os.path.basename(root)
    marker = "{}_".format(prefix)
    for f in files:
        if f.startswith(marker):
            continue  # already renamed on an earlier run
        os.rename(os.path.join(root, f), os.path.join(root, marker + f))

In [None]:
# Copy ./Dataset to ./Dataset_TIFF with the same folder structure, skipping
# every file that matches *.jpg or *.db.
_skip_non_tiff = shutil.ignore_patterns('*.jpg', '*.db')
shutil.copytree('./Dataset', './Dataset_TIFF', ignore=_skip_non_tiff)

In [None]:
def Seperate_Folder_per_protein(src, dst, file_ending_with):
    """Copy every file below ``src`` whose name ends with a suffix into ``dst``.

    The folder tree under ``src`` is walked recursively and all matching files
    are copied into the single, flat folder ``dst``; two matching files with
    the same name in different sub-folders therefore overwrite each other.

    Args:
        src: Root folder that is searched recursively.
        dst: Destination folder. It is created if missing, with or without a
            trailing path separator.
        file_ending_with: Suffix (or tuple of suffixes) a file name must end
            with to be copied.
    """
    # Create the destination folder itself.  Creating only its parent
    # (os.path.dirname(dst)) worked only when dst ended with a separator.
    os.makedirs(dst, exist_ok=True)
    for root, _dirs, files in os.walk(src):
        for file in files:
            if file.endswith(file_ending_with):
                copy(os.path.join(root, file), dst)

In [None]:
# Gather the images of the selected protein for the selected subject group in
# their own folder.  The destination is derived from protein_name (it used to
# hard-code "Dystrophin"), so it stays in step with the file suffix below.
src = f'./Dataset_TIFF/{subject_type}'
dst = f'./Dataset_TIFF_{protein_name}/{subject_type}/'
file_ending_with = f"{protein_name}.ome.tiff"
Seperate_Folder_per_protein(src, dst, file_ending_with)

In [None]:
def dir_create(path):
    """Make sure ``path`` exists as an empty directory and return ``path``.

    A directory that already exists and has content is deleted together with
    everything in it and recreated empty; an existing empty directory is left
    alone; a missing directory (including missing parents) is created.

    Args:
        path: Directory to prepare.

    Returns:
        ``path`` unchanged, so that the call can be used in an assignment such
        as ``out = dir_create('./some/dir/')``.
    """
    if os.path.exists(path) and os.listdir(path):
        # Existing content is discarded on purpose to start from a clean folder.
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)
    return path

def crop(input_file, height, width):
    """Yield non-overlapping tiles of an image, row by row.

    The image is cut into ``img_height // height`` rows and
    ``img_width // width`` columns; pixels to the right of the last full
    column and below the last full row are not part of any tile.

    Args:
        input_file: Path (or file object) accepted by ``PIL.Image.open``.
        height: Tile height in pixels.
        width: Tile width in pixels.

    Yields:
        One cropped image per tile, ordered left to right, top to bottom.
    """
    # Opening the image in a ``with`` block releases the file handle once the
    # generator is exhausted or closed instead of leaving it open.
    with Image.open(input_file) as img:
        img_width, img_height = img.size
        for i in range(img_height // height):
            for j in range(img_width // width):
                # PIL boxes are (left, upper, right, lower).
                box = (j * width, i * height, (j + 1) * width, (i + 1) * height)
                yield img.crop(box)

def split(inp_img_dir, out_dir, height, width,
          start_num, protein_label='UqCRC2'):
    """Cut every image file in ``inp_img_dir`` into tiles saved in ``out_dir``.

    ``out_dir`` is prepared with ``dir_create`` (existing content is removed).
    Each tile produced by ``crop`` is pasted into a new 16-bit (``'I;16'``)
    image filled with 65535 and saved as
    ``<subject>_<protein_label><counter>.tiff``, where ``<subject>`` is the part
    of the input file name before the first ``_`` and ``<counter>`` is a
    5-digit, zero-padded number that starts at ``start_num`` for every input
    file.

    Args:
        inp_img_dir: Folder whose regular files are all treated as images.
        out_dir: Folder that receives the tiles; must differ from
            ``inp_img_dir``.
        height: Tile height in pixels.
        width: Tile width in pixels.
        start_num: Number given to the first tile of each input file.
        protein_label: Text placed between the subject ID and the counter.
            The default reproduces the previously hard-coded file names.

    Raises:
        ValueError: If ``out_dir`` and ``inp_img_dir`` are the same folder,
            because clearing the output folder would delete the input images.
    """
    # Imported here because the progress output below needs it and the
    # notebook's import cell does not import ``sys``.
    import sys

    if os.path.realpath(inp_img_dir) == os.path.realpath(out_dir):
        raise ValueError(
            "out_dir must differ from inp_img_dir: the output folder is "
            "emptied before writing, which would delete the input images")
    dir_create(out_dir)

    img_list = [f for f in os.listdir(inp_img_dir)
                if os.path.isfile(os.path.join(inp_img_dir, f))]
    for file_num, infile in enumerate(img_list, 1):
        infile_path = os.path.join(inp_img_dir, infile)
        subject_id = infile.split('_')[0]
        for k, piece in enumerate(crop(infile_path, height, width), start_num):
            # PIL sizes are (width, height); the previous (height, width)
            # order was only correct for square tiles.
            img = Image.new('I;16', (width, height), 65535)
            img.paste(piece)
            img_path = os.path.join(
                out_dir,
                subject_id + '_' + protein_label + str(k).zfill(5) + '.tiff')
            img.save(img_path)

        # Single-line progress indicator, overwritten after every file.
        sys.stdout.write("\rFile %s was processed." % file_num)
        sys.stdout.flush()

In [None]:
# Split every image of the selected protein / subject group into tiles.
# The tiles go to a separate "_Split" folder: split() empties its output
# folder before writing, so reusing the input folder as output would delete
# the source images before they are read.
inp_img_dir = f'./Dataset_TIFF_{protein_name}/{subject_type}'
out_dir = f'./Dataset_TIFF_{protein_name}_Split/{subject_type}'
height = split_image_height
width = split_image_width
start_num = 1

split(inp_img_dir, out_dir, height, width, start_num)

## Preprocessing for multichannel data

In [None]:
from patchify import patchify
from glob import glob
import numpy as np
from tifffile import imread,imwrite

In [None]:
# function to merge all channels
def channel_merger(img_list, dtype=None):
    """Stack equally shaped single-channel images along a new last axis.

    Args:
        img_list: Sequence of arrays with identical shape, one per protein
            expression image of a subject. Channel ``i`` of the result is
            ``img_list[i]``.
        dtype: Optional dtype of the result. By default the dtype of the
            inputs is kept, so 16-bit pixel values are no longer wrapped into
            8 bits. Pass ``np.uint8`` to get the former fixed 8-bit output.

    Returns:
        Array of shape ``(*img_list[0].shape, len(img_list))``.

    Raises:
        ValueError: If ``img_list`` is empty or the image shapes differ.
    """
    if len(img_list) == 0:
        raise ValueError("channel_merger needs at least one image")

    multi_channel_img = np.stack(img_list, axis=-1)
    if dtype is not None:
        multi_channel_img = multi_channel_img.astype(dtype, copy=False)

    return multi_channel_img

In [None]:
# Output folders for the merged multi-channel arrays.  The paths are kept as
# strings (with a trailing slash, because file names are appended by plain
# string concatenation later) rather than taken from dir_create()'s return
# value, which left both names bound to None.
new_path_controls = './TIFF_Images_Concat/controls/'
new_path_patients = './TIFF_Images_Concat/patients/'
dir_create(new_path_controls)
dir_create(new_path_patients)

In [None]:
def _subject_dirs(group_root):
    """Return the paths of the directories found directly inside ``group_root``."""
    return [entry.path for entry in os.scandir(group_root) if entry.is_dir()]


# One folder per subject, for each of the two subject groups.
controls_dirs = _subject_dirs('./Dataset_TIFF/controls/')
patients_dirs = _subject_dirs('./Dataset_TIFF/patients/')

In [None]:
# Merge the TIFF images of every control subject into one multi-channel array
# and save it; the channel order follows the sorted file names, and the output
# name starts with the last two characters of the subject folder path.
for subject_dir in controls_dirs:
    channel_files = glob(subject_dir + '/*.tiff')
    channel_files.sort()
    channels = [imread(tiff_file) for tiff_file in channel_files]
    merged = channel_merger(channels)
    out_file = str(new_path_controls) + subject_dir[-2:] + '_combined.npy'
    np.save(out_file, merged)

In [None]:
# Merge the TIFF images of every patient subject into one multi-channel array
# and save it; the channel order follows the sorted file names, and the output
# name starts with the last two characters of the subject folder path.
for subject_dir in patients_dirs:
    channel_files = glob(subject_dir + '/*.tiff')
    channel_files.sort()
    channels = [imread(tiff_file) for tiff_file in channel_files]
    merged = channel_merger(channels)
    out_file = str(new_path_patients) + subject_dir[-2:] + '_combined.npy'
    np.save(out_file, merged)

In [None]:
# Output folders for the patches cut from the multi-channel arrays.
# The string literals were missing their closing quote (a syntax error).  The
# paths are kept as strings with a trailing slash because the patch file names
# are appended by plain string concatenation, and they are no longer taken
# from dir_create()'s return value, which left both names bound to None.
splited_control_dir = './Concat_Split_Images/Controls/'
splited_patient_dir = './Concat_Split_Images/Patients/'
dir_create(splited_control_dir)
dir_create(splited_patient_dir)
                                

In [None]:
# controls: cut every merged control array into 512x512 patches (stride 256)
# `glob` is the function imported with `from glob import glob`, so it is
# called directly (`glob.glob` raised AttributeError).  Sorting makes the
# file numbering reproducible between runs.
files = sorted(glob(str(new_path_controls) + '/**/*.npy', recursive=True))
for file_num, file in enumerate(files, 1):
    img = np.load(file)
    # The patch spans all channels of the array instead of assuming 10.
    patches_img = patchify(img, (512, 512, img.shape[2]), step=256)
    for i in range(patches_img.shape[0]):
        for j in range(patches_img.shape[1]):
            single_patch_img = patches_img[i, j, 0, :, :, :]
            # Row and column are separated by "_": written back to back,
            # (1, 12) and (11, 2) both gave "112" and overwrote each other.
            np.save(str(splited_control_dir) + "C0" + str(file_num) + '_image_' + '_'
                    + str(i) + '_' + str(j) + '.npy', single_patch_img)

In [None]:
# patients: cut every merged patient array into 512x512 patches (stride 256)
# `glob` is the function imported with `from glob import glob`, so it is
# called directly (`glob.glob` raised AttributeError).  Sorting makes the
# file numbering reproducible between runs.
files = sorted(glob(str(new_path_patients) + '/**/*.npy', recursive=True))
for file_num, file in enumerate(files, 1):
    img = np.load(file)
    # The patch spans all channels of the array instead of assuming 10.
    patches_img = patchify(img, (512, 512, img.shape[2]), step=256)
    for i in range(patches_img.shape[0]):
        for j in range(patches_img.shape[1]):
            single_patch_img = patches_img[i, j, 0, :, :, :]
            # Row and column are separated by "_": written back to back,
            # (1, 12) and (11, 2) both gave "112" and overwrote each other.
            np.save(str(splited_patient_dir) + "P0" + str(file_num) + '_image_' + '_'
                    + str(i) + '_' + str(j) + '.npy', single_patch_img)

In [None]:
import os
import pandas as pd

# Build a CSV with one row per patch file: its path and its class folder.
# The folders are addressed relative to the current working directory instead
# of calling os.chdir(), so the working directory stays unchanged, the cell
# can be re-run, and later relative paths keep working.  The CSV is written to
# the same place as before (inside ./Concat_Split_Images).
split_root = './Concat_Split_Images'
folders = ['Controls', 'Patients']

files = []

for folder in folders:
    # Sorted so that the row order is the same on every run.
    for file in sorted(os.listdir(os.path.join(split_root, folder))):
        files.append([str('./Concat_Split_Images/') + folder + str('/') + file, folder])

pd.DataFrame(files, columns=['files', 'target']).to_csv(
    os.path.join(split_root, 'files_and_targets.csv'))