# Preprocessing Shared Drive Data

This is a Colab notebook that converts the nested folder layout on the shared Google Drive (`Embryo<idx>/t<t>/c<c>/*.png`) into one multi-dimensional NumPy array per embryo and channel.

In [33]:
# Half-open range [START, STOP) of spreadsheet row positions to process.
# Leave both as None to process every row.
START = STOP = None
# Example: uncomment to restrict the run to rows 10 through 19.
# START = 10
# STOP = 20

In [18]:
# Workaround for https://github.com/googlecolab/colabtools/issues/1494
# The sed command edits the installed google.colab drive.py in place, appending
# `metadata_cache_reset_counter:4` after `enforce_single_parent:true`.
# NOTE(review): the path is hard-coded to the python3.6 dist-packages folder; on a
# runtime with a different Python version sed will not find the file -- confirm.
!sed -i -e 's/enforce_single_parent:true/enforce_single_parent:true,metadata_cache_reset_counter:4/' /usr/local/lib/python3.6/dist-packages/google/colab/drive.py
from google.colab import drive
import importlib
# Reload the module so the copy in memory is re-read from the file patched above.
_ = importlib.reload(drive)

In [19]:
import os
import sys
import time
import pandas as pd
import numpy as np
from matplotlib import image

In [20]:
# Mount Google Drive at /content/gdrive; every path used below lives under this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [26]:
# Root folder of the shared drive that holds both the raw PNGs and all outputs.
shared_path = '/content/gdrive/Shared drives/Embryo_data'

# Output sub-folders, relative to shared_path. In the two-element lists,
# index 0 is the fluo channel and index 1 is the bf channel.
data_labels = ['data/fluo_data', 'data/bf_data']
processed_labels = ['processed/fluo_data', 'processed/bf_data']
polarized_label = 'processed/polarization'

# Per-embryo metadata spreadsheet (one row per embryo).
embryo_data = pd.read_excel(f'{shared_path}/embryo_info_CS101.xlsx')


In [27]:
# Copied from utils

def get_z_slice(z, img):
    """Return the slice at index ``z`` along the first (z) axis of a 4-D stack."""
    n_axes = len(img.shape)
    assert n_axes == 4
    return img[z, ...]

def get_img_at_t(t, img):
    """Return the slice at index ``t`` along the last (time) axis of a 4-D stack."""
    n_axes = len(img.shape)
    assert n_axes == 4
    return img[..., t]

def normalize(img):
    """Min-max rescale a 4-D stack so its values span 0-1.

    The minimum and maximum are taken over the whole array, not per image.
    A constant array is returned shifted to zero without dividing.
    """
    assert len(img.shape) == 4

    shifted = img - np.min(img)
    peak = np.max(shifted)
    # Skip the division when every value is identical (peak == 0).
    return shifted if peak == 0 else shifted / peak

def middle_z(img):
    """Keep only the middle z-slice of a 4-D (z, x, y, t) stack.

    The slice index is ``zdim // 2``. The result is a float array of shape
    (1, x, y, t), so it still has a z axis of length one.
    """
    assert len(img.shape) == 4

    depth, xdim, ydim, tdim = img.shape
    out = np.empty(shape=(1, xdim, ydim, tdim))
    out[0] = get_z_slice(depth // 2, img)
    return out

def max_across_z(img, normalize=False):
    """Collapse a z-stack to the per-pixel maximum across z.

    Args:
        img: 4-D array indexed as (z, x, y, t).
        normalize: when true, min-max rescale ``img`` to 0-1 (over the whole
            array) before taking the maximum.

    Returns:
        A float array of shape (1, x, y, t) holding the maximum over the
        z axis for every (x, y, t) position.

    Raises:
        ValueError: if ``img`` does not have exactly four dimensions.
    """
    if normalize:
        # The ``normalize`` flag shadows the module-level normalize() helper
        # inside this function, so ``normalize(img)`` would try to call a bool.
        # Apply the same min-max rescaling inline instead.
        shifted = img - np.min(img)
        peak = np.max(shifted)
        img = shifted / peak if peak != 0 else shifted

    zdim, xdim, ydim, tdim = img.shape
    result = np.empty(shape=(1, xdim, ydim, tdim))

    result[0] = np.amax(img, axis=0)
    return result

def min_across_z(img, normalize=False):
    """Collapse a z-stack to the per-pixel minimum across z.

    Args:
        img: 4-D array indexed as (z, x, y, t).
        normalize: when true, min-max rescale ``img`` to 0-1 (over the whole
            array) before taking the minimum.

    Returns:
        A float array of shape (1, x, y, t) holding the minimum over the
        z axis for every (x, y, t) position.

    Raises:
        ValueError: if ``img`` does not have exactly four dimensions.
    """
    if normalize:
        # The ``normalize`` flag shadows the module-level normalize() helper
        # inside this function, so ``normalize(img)`` would try to call a bool.
        # Apply the same min-max rescaling inline instead.
        shifted = img - np.min(img)
        peak = np.max(shifted)
        img = shifted / peak if peak != 0 else shifted

    zdim, xdim, ydim, tdim = img.shape
    result = np.empty(shape=(1, xdim, ydim, tdim))

    result[0] = np.amin(img, axis=0)
    return result

def avg_across_z(img, normalize=False):
    """Collapse a z-stack to the per-pixel mean across z.

    Args:
        img: 4-D array indexed as (z, x, y, t).
        normalize: when true, min-max rescale ``img`` to 0-1 (over the whole
            array) before averaging.

    Returns:
        A float array of shape (1, x, y, t) holding the mean over the
        z axis for every (x, y, t) position.

    Raises:
        ValueError: if ``img`` does not have exactly four dimensions.
    """
    if normalize:
        # The ``normalize`` flag shadows the module-level normalize() helper
        # inside this function, so ``normalize(img)`` would try to call a bool.
        # Apply the same min-max rescaling inline instead.
        shifted = img - np.min(img)
        peak = np.max(shifted)
        img = shifted / peak if peak != 0 else shifted

    zdim, xdim, ydim, tdim = img.shape
    result = np.empty(shape=(1, xdim, ydim, tdim))

    result[0] = np.mean(img, axis=0)
    return result

In [28]:
def get_c_dir(embryo_idx, t, c):
    """Build the folder path that holds one channel's PNGs at one time point."""
    embryo_root = f'{shared_path}/Embryo{embryo_idx}'
    return f'{embryo_root}/t{t}/c{c}'

def get_png_path(embryo_idx, t, c, z):
    """Build the path of the PNG for channel ``c``, z-slice ``z`` and time point ``t``."""
    folder = get_c_dir(embryo_idx, t, c)
    filename = f'c{c}z{z}t{t}.png'
    return f'{folder}/{filename}'

def process_embryo(embryo_idx, t_num, c_fluo, c_bf):
    """Stack one embryo's PNG frames into one array per channel and save it.

    For each of the two channels, every frame
    ``{shared_path}/Embryo{embryo_idx}/t{t}/c{c}/c{c}z{z}t{t}.png`` is read and
    the frames are stacked with axes (z, x, y, t). The result is written to
    ``{shared_path}/{data_labels[i]}/embryo{embryo_idx}.npy``. A channel whose
    output file already exists is skipped, so the function can be re-run.

    Args:
        embryo_idx: embryo identifier used in folder and file names.
        t_num: number of time points; frames t = 1..t_num are read.
        c_fluo: channel number whose output goes under ``data_labels[0]``.
        c_bf: channel number whose output goes under ``data_labels[1]``.
    """
    print(f'starting {embryo_idx} data')

    for i, c in enumerate([c_fluo, c_bf]):
        output_path = f'{shared_path}/{data_labels[i]}/embryo{embryo_idx}.npy'
        if os.path.exists(output_path):
            continue
        # np.save does not create missing parent folders.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # The number of entries in the first time point's channel folder is
        # taken as the z-depth; z and t are both 1-based in the file names.
        max_z = len(os.listdir(get_c_dir(embryo_idx, 1, c)))
        video = np.array([
            [image.imread(get_png_path(embryo_idx, t, c, z))
             for t in range(1, t_num + 1)]
            for z in range(1, max_z + 1)
        ])
        # Frames were gathered as (z, t, x, y); move t to the end -> (z, x, y, t).
        video = np.moveaxis(video, 1, -1)
        np.save(output_path, video)

    print(f'finished {embryo_idx} data')

In [29]:
def postprocess_embryo(embryo_idx, t_num, first_pol_idx):
    """Write the z-collapsed variants and the polarization labels for one embryo.

    For each channel, the saved stack
    ``{shared_path}/{data_labels[i]}/embryo{embryo_idx}.npy`` is reduced with
    middle_z, max_across_z, min_across_z and avg_across_z, and each result is
    saved under ``{shared_path}/{processed_labels[i]}/<middle|max|min|avg>/``.
    Outputs that already exist are left untouched. The polarization label file
    is always (re)written.

    Args:
        embryo_idx: embryo identifier used in file names.
        t_num: number of time points of the embryo.
        first_pol_idx: 1-based time point from which the label is 1.
    """
    print(f'starting {embryo_idx} processed')
    reducers = [('middle', middle_z), ('max', max_across_z),
                ('min', min_across_z), ('avg', avg_across_z)]

    for i in range(2):
        # Collect the outputs that are still missing for this channel.
        pending = []
        for subdir, func in reducers:
            output_path = f'{shared_path}/{processed_labels[i]}/{subdir}/embryo{embryo_idx}.npy'
            if not os.path.exists(output_path):
                pending.append((output_path, func))
        if not pending:
            continue

        # Load the raw stack once per channel rather than once per reducer.
        stack = np.load(f'{shared_path}/{data_labels[i]}/embryo{embryo_idx}.npy')
        for output_path, func in pending:
            # np.save does not create missing parent folders.
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            np.save(output_path, func(stack))

    # One label per time point: 0 before first_pol_idx, 1 from it onwards.
    labels = [0] * (first_pol_idx - 1) + [1] * (t_num - first_pol_idx + 1)
    label_path = f'{shared_path}/{polarized_label}/embryo{embryo_idx}.npy'
    os.makedirs(os.path.dirname(label_path), exist_ok=True)
    np.save(label_path, labels)
    print(f'finished {embryo_idx} processed')

In [32]:
# Resolve the configured row range into local names. Each bound is defaulted
# independently and only when it is None, so START = 0 is honoured and a
# user-supplied STOP is never overwritten.
start = 0 if START is None else START
stop = len(embryo_data) if STOP is None else STOP

# An embryo is processed only if every one of these columns is truthy.
quality_cols = ['if_full_injected', 'fluo_quality_of_z_max_sum', 'fluo_quality_of_raw_png', 'if_healthy']
info_cols = ["embryo_index", "t_num", "fluo_channel", "DIC_channel", "first_anno_pol_time"]

# Extract the arrays once instead of rebuilding them on every iteration.
quality_flags = embryo_data[quality_cols].values
embryo_info = embryo_data[info_cols].values

processed = []
for pd_idx in range(start, stop):
    if not all(quality_flags[pd_idx]):
        continue
    embryo_idx, t_num, c_fluo, c_bf, first_pol_idx = embryo_info[pd_idx]
    process_embryo(embryo_idx, t_num, c_fluo, c_bf)
    postprocess_embryo(embryo_idx, t_num, first_pol_idx)
    processed.append(embryo_idx)
# Count the embryos that passed the quality filter and were processed
# (the original referenced an undefined name `args` here).
print(f'there are {len(processed)} total embryos')

starting 39 data
finished 39 data
starting 39 processed
finished 39 processed
starting 40 data
finished 40 data
starting 40 processed
finished 40 processed
starting 42 data
finished 42 data
starting 42 processed


KeyboardInterrupt: ignored