## This notebook is an example: create a copy before running it or you will get merge conflicts!

This notebook will walk you through the process of normalizing your image data. Before running through the notebook, make sure you've completed section 3 of `1_set_up_toffy.ipynb`, and that your data has already been compensated with rosetta using `4_compensate_image_data.ipynb`

In [1]:
import sys
sys.path.append('../')

import os
import pandas as pd
import skimage.io as io # new

from toffy import normalize
from ark.utils.io_utils import list_files, list_folders

### You'll first need to specify the location of the relevant files to enable image normalization

In [2]:
# First specify the name of the run that you'll be normalizing

# Then provide the path to your panel
panel_path = 'I:\\20220518_TONIC_panel_file.csv'
panel = pd.read_csv(panel_path)

# These paths should point to the folders containing each step of the processing pipeline
bin_base_dir = 'I:\\run_files'
rosetta_base_dir = 'I:\\rosetta'
normalized_base_dir = 'I:\\normalized'
mph_base_dir = bin_base_dir

In [25]:
import shutil
moly_base_dir = 'I:\\moly_run_fovs'
runs = list_folders(rosetta_base_dir)
for run in runs[2:]:
    run_dir = os.path.join(bin_base_dir, run)
    new_dir = os.path.join(moly_base_dir, run)
    os.makedirs(new_dir)
    moly_fovs = json_utils.list_moly_fovs(run_dir)
    for fov in moly_fovs:
        files = list_files(run_dir, fov)
        for file in files:
            shutil.move(os.path.join(run_dir, file), 
                            os.path.join(new_dir, file))
        
        shutil.rmtree(os.path.join(rosetta_base_dir, run, fov))
        shutil.rmtree(os.path.join('I:\\extracted', run, fov))

In [15]:
for run_name in run_names[3:]:
    run_dir = os.path.join(bin_base_dir, run_name)
    previous_pulses = list_files(run_dir, 'pulse')
    for file in previous_pulses:
        os.remove(os.path.join(run_dir, file))

In [6]:
run_names = list_folders(rosetta_base_dir)
run_names[18:30]

['2022-02-21_TONIC_TMA11_run2',
 '2022-02-23_TONIC_TMA12_run1',
 '2022-02-24_TONIC_TMA12_run2',
 '2022-02-25_TONIC_TMA12_run3',
 '2022-02-26_TONIC_TMA13',
 '2022-02-26_TONIC_TMA13_restart',
 '2022-02-28_TONIC_TMA13_run2',
 '2022-03-01_TONIC_TMA14_run1',
 '2022-03-02_TONIC_TMA14_run2',
 '2022-03-03_TONIC_TMA15_run1',
 '2022-03-04_TONIC_TMA15_run2',
 '2022-03-05_TONIC_TMA15_run3']

### Then, we'll loop over each FOV, generating the necessary normalization files if they weren't already created, then normalizing the images, and finally saving them to the output folder

In [None]:
run_names = list_folders(rosetta_base_dir)
#for run_name in run_names[18:30]:
for run_name in ['2022-03-01_TONIC_TMA14_run1a', '2022-03-01_TONIC_TMA14_run1b']:
    print("analyzing run {}".format(run_name))
    # specify sub-folder for rosetta images
    img_sub_folder = 'normalized'

    # create directory to hold normalized images
    normalized_run_dir = os.path.join(normalized_base_dir, run_name)
    if not os.path.exists(normalized_run_dir):
        os.makedirs(normalized_run_dir)

    # create directory to hold associated processing files
    mph_run_dir = os.path.join(mph_base_dir, run_name)
    if not os.path.exists(mph_run_dir):
        os.makedirs(mph_run_dir)

    # get all FOVs
    fovs = list_folders(os.path.join(rosetta_base_dir, run_name), 'fov')

    # loop over each FOV
    for fov in fovs:
        # generate mph values
        mph_file_path = os.path.join(mph_run_dir, fov + '_pulse_heights.csv')
        if not os.path.exists(mph_file_path):
            normalize.write_mph_per_mass(base_dir=os.path.join(bin_base_dir, run_name), output_dir=mph_run_dir, 
                                         fov=fov, masses=panel['Mass'].values, start_offset=0.3, stop_offset=0)
        
    normalize.normalize_image_data(img_dir=os.path.join(rosetta_base_dir, run_name), norm_dir=normalized_run_dir, pulse_height_dir=mph_run_dir,
                               panel_info=panel, img_sub_folder=img_sub_folder, mass_obj_func='poly_2')

In [None]:
run_name = '2022-03-01_TONIC_TMA14_run1b'
normalize.create_fitted_pulse_heights_file(pulse_height_dir=os.path.join(bin_base_dir, run_name), panel_info=panel, 
                                           norm_dir=os.path.join(normalized_base_dir, run_name), mass_obj_func='poly_2')

In [11]:
# create stitched image of before and after
import skimage.io as io
import natsort as ns
from ark.utils import data_utils, load_utils, io_utils
import numpy as np

normalized=True
unnormalized=False

# problematic runs to check

# bad curve fit: 2022-03-01_TONIC_TMA14_run1
run_name = '2022-01-14_TONIC_TMA2_run1'
#run_name = '2022-03-13_TONIC_TMA18_run1'

normalized_run_dir = os.path.join(normalized_base_dir, run_name)
unnormalized_run_dir = os.path.join(rosetta_base_dir, run_name)

folders = io_utils.list_folders(normalized_run_dir, 'fov-')
folders = ns.natsorted(folders)

if normalized:
    # get all channels
    channels = load_utils.load_imgs_from_tree(unnormalized_run_dir,
                                                fovs=folders[:1],
                                                img_sub_folder='normalized', 
                                               dtype='float32').channels.values

    # load and stitch normalized data
    stitch_dir = os.path.join(normalized_run_dir, 'stitched_images_normalized')
    if not os.path.exists(stitch_dir):
        os.makedirs(stitch_dir)

    for chan in channels[1:]:
        img_data = load_utils.load_imgs_from_tree(normalized_run_dir,
                                                fovs=folders,
                                                img_sub_folder='', 
                                               dtype='float32',
                                                 channels=[chan], 
                                                 max_image_size=2048)

        stitched = data_utils.stitch_images(img_data, int(np.floor(np.sqrt(img_data.shape[0]))))


        # save normalized data
        current_img = stitched.loc['stitched_image', :, :, chan].values
        io.imsave(os.path.join(stitch_dir, chan + '.tiff'), current_img.astype('float32'), check_contrast=False)


if unnormalized:

    # load and stitch unnormalized data
    unnormalized_run_dir = os.path.join(rosetta_base_dir, run_name)

    stitch_dir = os.path.join(normalized_run_dir, 'stitched_images_unnormalized')
    if not os.path.exists(stitch_dir):
        os.makedirs(stitch_dir)

    for chan in channels:
        img_data = load_utils.load_imgs_from_tree(unnormalized_run_dir,
                                                fovs=folders,
                                                img_sub_folder='normalized', 
                                               dtype='float32',
                                                 channels=[chan],
                                                 max_image_size=2048)

        stitched = data_utils.stitch_images(img_data, int(np.floor(np.sqrt(img_data.shape[0]))))


        # save normalized data
        current_img = stitched.loc['stitched_image', :, :, chan].values
        io.imsave(os.path.join(stitch_dir, chan + '.tiff'), current_img.astype('float32'), check_contrast=False)

In [17]:
xx = io.imread(os.path.join(normalized_run_dir, 'fov-1-scan-1'

'CD11c'

In [None]:
img_data = load_utils.load_imgs_from_tree(normalized_run_dir,
                                                    fovs=folders[38:39],
                                                    img_sub_folder='', 
                                                   dtype='float32',
                                                     channels=[chan], 
                                                     max_image_size=2048)

In [69]:
pulse_heights = pd.read_csv(os.path.join(mph_base_dir, run_name + '_small_window', 'pulse_heights_combined.csv'))
masses = np.unique(panel['Mass'])
for i in range(1, 10):
        old_name = 'fov-{}-scan-1'.format(i)
        new_name = 'fov-0{}-scan-1'.format(i)
        pulse_heights = pulse_heights.replace(old_name, new_name)

for mass in masses[:1]:
    mass = 175
    pulse_heights = pulse_heights.sort_values('fov')
    fovs = np.unique(pulse_heights['fov'])
    mo_fovs = [fov.split('-scan')[0] for fov in fovs if 'scan-2' in fov]
    complete_mo_fovs = []
    for fov in mo_fovs:
        complete_mo_fovs.append(fov + '-scan-1')
        complete_mo_fovs.append(fov + '-scan-2')
        complete_mo_fovs.append(fov + '-scan-3')
    fovs = [fov for fov in fovs if fov not in complete_mo_fovs]
    pulse_heights = pulse_heights.loc[np.isin(pulse_heights['fov'], fovs), :]

    y = pulse_heights.loc[pulse_heights['mass'] == mass, 'pulse_height'].values
    x = np.linspace(0, len(y) - 1, len(y))


    def reg_func(_x, _y):
        return np.polyval(np.polyfit(_x, _y, 2), np.linspace(0, len(x), len(x)))


    from seaborn import algorithms as algo
    from seaborn.utils import ci
    yhat_boots = algo.bootstrap(pd.Series(x), pd.Series(y), func=reg_func,
                                n_boot=1000, units=None)
    err_bands = ci(yhat_boots, 95, axis=0)

    top_band = err_bands[1]
    outlier_fovs = []
    for idx, val in enumerate(y):
        if val > top_band[idx]:
            outlier_fovs.append(fovs[idx])

    outlier_fovs = [fov.replace('-0', '-') for fov in outlier_fovs]
    
    # create directory to hold stiched images
    out_dir = os.path.join(normalized_base_dir, run_name, 'outlier_fovs_{}'.format(mass))
    os.makedirs(out_dir)
    channel_name = panel.loc[panel['Mass'] == mass, 'Target'].values[0]
    
    img_data = load_utils.load_imgs_from_tree(os.path.join(rosetta_base_dir, run_name),
                                            img_sub_folder='normalized', 
                                            dtype='float32',
                                              fovs=outlier_fovs,
                                             channels=[channel_name],
                                             max_image_size=2048)

    stitched = data_utils.stitch_images(img_data, int(np.floor(np.sqrt(img_data.shape[0]))))


    # save normalized data
    current_img = stitched.loc['stitched_image', :, :, channel_name].values
    io.imsave(os.path.join(out_dir, channel_name + '.tiff'), current_img, check_contrast=False)

In [70]:
outlier_fovs

['fov-3-scan-1',
 'fov-12-scan-1',
 'fov-13-scan-1',
 'fov-16-scan-1',
 'fov-18-scan-1',
 'fov-22-scan-1',
 'fov-30-scan-1',
 'fov-34-scan-1',
 'fov-38-scan-1',
 'fov-45-scan-1',
 'fov-51-scan-1',
 'fov-54-scan-1',
 'fov-55-scan-1']

In [71]:
fovs[-5:]

['fov-50-scan-1',
 'fov-51-scan-1',
 'fov-52-scan-1',
 'fov-54-scan-1',
 'fov-55-scan-1']

In [12]:
# create plots
import os
import pandas as pd
from ark.utils import io_utils

run_names = ['2022-01-14_TONIC_TMA2_run1', '2022-01-21_TONIC_TMA5', '2022-01-26_TONIC_TMA9', '2022-02-26_TONIC_TMA13',
            '2022-03-02_TONIC_TMA14_run2', '2022-03-14_TONIC_TMA18_run3', '2022-04-05_TONIC_TMA20_run1', '2022-04-10_TONIC_TMA22_run1']

for run in run_names:
    run_dir = os.path.join(mph_base_dir, run + '_small_window')
    files = io_utils.list_files(run_dir, '_pulse_heights')

    metrics = []
    for file in files:
        metrics.append(pd.read_csv(os.path.join(run_dir, file)))

    metrics = pd.concat(metrics)

    metrics.to_csv(os.path.join(run_dir, 'pulse_heights_combined.csv'), index=False)

In [72]:
import natsort as ns
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

for run in run_names[:1]:
    run_dir = os.path.join(mph_base_dir, run + '_small_window')
    pulse_heights = pd.read_csv(os.path.join(run_dir, 'pulse_heights_combined.csv'))
    fovs = ns.natsorted(pulse_heights['fov'].unique())
    
    mo_fovs = [fov.split('-scan')[0] for fov in fovs if 'scan-2' in fov]
    complete_mo_fovs = []
    for fov in mo_fovs:
        complete_mo_fovs.append(fov + '-scan-1')
        complete_mo_fovs.append(fov + '-scan-2')
        complete_mo_fovs.append(fov + '-scan-3')
    fovs = [fov for fov in fovs if fov not in complete_mo_fovs]
    pulse_heights = pulse_heights.loc[np.isin(pulse_heights['fov'], fovs), :]
    
    pulse_heights['fov'] = pd.Categorical(pulse_heights['fov'], ordered=True,
                                               categories=ns.natsorted(pulse_heights['fov'].unique()))
    pulse_heights = pulse_heights.sort_values('fov')

    # add numerical column for fovs to enable easier plotting by acq order
    fov_names = pulse_heights['fov'].values.tolist()
    fov_nums = [float(fov_name.split('-')[1]) for fov_name in fov_names]
    pulse_heights['acq_order'] = fov_nums

    for mass in np.unique(pulse_heights['mass'].values):

        plot_data = pulse_heights.loc[pulse_heights['mass'] == mass]

        sns.set_style("whitegrid")
        # g = sns.FacetGrid(data=plot_data, x='acq_order', y='pulse_height')
        sns.regplot(x='acq_order', y='pulse_height', data=plot_data, order=2)

        plot_dir = os.path.join(run_dir, 'mph_v_acq_per_mass')
        if not os.path.exists(plot_dir):
            os.makedirs(plot_dir)
        plt.savefig(os.path.join(plot_dir, str(mass) + '_mph_vs_acq.pdf'), bbox_inches='tight')
        plt.close()

In [None]:

json_path = os.path.join(bin_base_dir, run_name, 'fov-18-scan-1.json')
with open(json_path, 'r') as jp:
    json_file = json.load(jp)

json_file['standardTarget']