### This notebook runs FlowSOM pixel-level and cell-level clustering

In [None]:
# import required packages
from datetime import datetime as dt
import os
import subprocess

import numpy as np
import pandas as pd
import xarray as xr

from ark.phenotyping import som_utils
from ark.utils import io_utils, load_utils

### Set file paths and parameters

* base_dir: the path to all of your imaging data. Should contain a directory for your images, segmentations, and cell table (generated from Segment_Image_Data.ipynb).
* tiff_dir: the path to the directory containing your imaging data
* segmentation_dir: the path to the directory containing your segmentations (generated from Segment_Image_Data.ipynb)

In [None]:
# root folder for the run; the clustering outputs configured below are placed inside it
base_dir = "../data/granulomaCohort_allData"
# image data is read from the TIFs subfolder of base_dir
tiff_dir = os.path.join(base_dir, "TIFs")
# NOTE(review): this is the same literal as base_dir — confirm the segmentations
# really sit in the root folder and not in a dedicated subfolder
segmentation_dir = "../data/granulomaCohort_allData"

* preprocessed_dir: the name of the directory to save the preprocessed pixel data (placed in base_dir)
* subsetted_dir: the name of the directory to save the subsetted pixel data (placed in base_dir)

In [None]:
# folder names only (not full paths); both are placed in base_dir
preprocessed_dir = 'pixel_mat_preprocessed'
subsetted_dir = 'pixel_mat_subsetted'

* MIBItiff: whether the images in tiff_dir are in mibitiff format
* mibitiff_suffix (required if MIBItiff is True): the file suffix all mibitiff images contain

In [None]:
# set to True when the images in tiff_dir are mibitiff files
MIBItiff = False
# required when MIBItiff is True: the file suffix shared by all mibitiff images
mibitiff_suffix = '-MassCorrected-Filtered.tiff'

* fovs (optional): set a specific set of fovs to load, default loads all the fovs in tiff_dir

In [None]:
# either get all fovs in the folder...
if MIBItiff:
    # mibitiff fovs are listed as files matching the shared suffix; the variable is
    # spelled `mibitiff_suffix` (lowercase), referencing `MIBItiff_suffix` raised a NameError
    fovs = io_utils.list_files(tiff_dir, substrs=mibitiff_suffix)
else:
    # otherwise the fovs are taken from the folders found in tiff_dir
    fovs = io_utils.list_folders(tiff_dir)

# ... or optionally, select a specific set of fovs manually
# fovs = ["fov14"]

### Preprocess

* channels: set a subset to run pixel clustering over

In [None]:
# subset of channels to run pixel clustering over
# (presumably these must match the channel names in the image data — verify)
channels = ["CD45", "SMA", "Vimentin", "CD31", "Keratin-pan", "E-cadherin", "MastChyTry",
            "MPO", "CD20", "CD3", "CD14", "HLA-DR-DQ-DP", "Foxp3", "CD16", "CD11c", "CD206",
            "CD11b", "CD68", "CD163", "CD209"]

Set the following arguments

* blur_factor: the sigma to use for the Gaussian filter when running the Gaussian blur. Higher values are more aggressive in removing noise.
* subset_proportion: the proportion of pixels to take from each fov (e.g. 0.1 samples 10% of the pixels). Sampling is random.

In [None]:
# sigma of the Gaussian filter used for the blur; higher values remove noise more aggressively
blur_factor = 2
# share of pixels randomly sampled from each fov
subset_proportion = 0.1

In [None]:
# preprocess the pixel data for the selected fovs and channels; the preprocessed and
# subsetted results go to pre_dir and sub_dir, which are placed in base_dir
som_utils.create_pixel_matrix(
    fovs, channels, base_dir, tiff_dir, segmentation_dir,
    pre_dir=preprocessed_dir, sub_dir=subsetted_dir,
    is_mibitiff=MIBItiff,
    blur_factor=blur_factor, subset_proportion=subset_proportion
)

### Train pixel SOM

Set the paths to your pixel clustering data (all paths are placed in base_dir):

* pixel_clustered_dir: directory to store pixel data with just the SOM cluster assignments
* pixel_consensus_dir: directory to store pixel data with the SOM and consensus cluster assignments
* pixel_weights_name: file name to place the pixel SOM weights

If not set, the variables will default to a file path with the timestamp of the start of the run appended at the end (ex. pixel_mat_clustered_{timestamp})

In [None]:
# timestamp taken once so every default name produced by this run shares the same suffix
cur_time = dt.now().strftime('%Y-%m-%dT%H:%M:%S')

# pixel SOM cluster dir, consensus cluster dir, and SOM weights file:
# assign a string to use a custom name, or leave as None for the timestamped default
pixel_clustered_dir = None
pixel_consensus_dir = None
pixel_weights_name = None

pixel_clustered_dir = (
    'pixel_mat_clustered_{}'.format(cur_time)
    if pixel_clustered_dir is None else pixel_clustered_dir
)
pixel_consensus_dir = (
    'pixel_mat_consensus_{}'.format(cur_time)
    if pixel_consensus_dir is None else pixel_consensus_dir
)
pixel_weights_name = (
    'pixel_weights_{}.feather'.format(cur_time)
    if pixel_weights_name is None else pixel_weights_name
)

Set the following arguments:

* pixel_som_xdim: the number of nodes along the x-axis for the cluster mapping space
* pixel_som_ydim: the number of nodes along the y-axis for the cluster mapping space (total number of cluster assignments xdim * ydim)
* pixel_som_lr_start: the initial learning rate
* pixel_som_lr_end: the final learning rate, lr_start will degrade to lr_end
* pixel_som_num_passes: the number of training passes to make through the dataset

In [None]:
# SOM grid size: xdim * ydim gives the total number of cluster assignments
pixel_som_xdim = 10
pixel_som_ydim = 10
# the learning rate starts at lr_start and degrades to lr_end
pixel_som_lr_start = 0.05
pixel_som_lr_end = 0.01
# number of training passes to make through the dataset
pixel_som_num_passes = 1

In [None]:
# train the pixel-level SOM using the data in sub_dir and store its weights under weights_name
som_utils.train_pixel_som(
    fovs, channels, base_dir,
    sub_dir=subsetted_dir, weights_name=pixel_weights_name,
    xdim=pixel_som_xdim, ydim=pixel_som_ydim,
    lr_start=pixel_som_lr_start, lr_end=pixel_som_lr_end,
    num_passes=pixel_som_num_passes
)

### Assign pixel SOM clusters

In [None]:
# assign pixel SOM clusters from the trained weights; pre_dir supplies the
# preprocessed pixel data and cluster_dir names the output location
som_utils.cluster_pixels(
    fovs, base_dir,
    pre_dir=preprocessed_dir,
    weights_name=pixel_weights_name, cluster_dir=pixel_clustered_dir
)

In [None]:
# heatmap of the per-cluster averages, grouped by the pixel SOM cluster column
som_utils.visualize_pixel_cluster_data(
    fovs, channels, base_dir,
    cluster_dir=pixel_clustered_dir, pixel_cluster_col='cluster'
)

### Run pixel consensus clustering

Set the following arguments:

* pixel_consensus_max_k: the number of meta clusters to create
* pixel_consensus_cap: the maximum z-score to consider, anything greater will be clipped to this value

In [None]:
# number of meta clusters to create
pixel_consensus_max_k = 20
# maximum z-score to consider; anything greater is clipped to this value
pixel_consensus_cap = 3

In [None]:
# consensus (hierarchical) clustering on top of the pixel SOM cluster assignments;
# reads from cluster_dir and stores the combined assignments in consensus_dir
som_utils.pixel_consensus_cluster(
    fovs, channels, base_dir,
    cluster_dir=pixel_clustered_dir, consensus_dir=pixel_consensus_dir,
    max_k=pixel_consensus_max_k, cap=pixel_consensus_cap
)

In [None]:
# heatmap of the per-cluster averages, grouped by the pixel hierarchical cluster column
som_utils.visualize_pixel_cluster_data(
    fovs, channels, base_dir,
    cluster_dir=pixel_consensus_dir, pixel_cluster_col='hCluster_cap'
)

### Train cell SOM

* cell_table_name: name of the cell table created by Segment_Image_Data, should be placed in the root of base_dir. You can use either the normalized or arcsinh versions.

In [None]:
# file name of the cell table created by Segment_Image_Data; expected in the root of base_dir
cell_table_name = 'cell_table_size_normalized.csv'

Set the paths to your cell clustering data (all paths are placed in base_dir):

* cell_clustered_name: file to store cell data with just the SOM cluster assignments
* cell_consensus_name: file to store cell data with the SOM and consensus cluster assignments
* cell_weights_name: file name to place the cell SOM weights

If not set, the variables will default to a file path with the timestamp of the start of the run appended at the end (ex. cell_mat_clustered_{timestamp}.feather)

In [None]:
# cell SOM cluster file, cell consensus cluster file, and cell SOM weights file:
# assign a string to use a custom name, or leave as None for the timestamped default
# (cur_time is the timestamp taken when the pixel clustering paths were defined)
cell_clustered_name = None
cell_consensus_name = None
cell_weights_name = None

cell_clustered_name = (
    'cell_mat_clustered_{}.feather'.format(cur_time)
    if cell_clustered_name is None else cell_clustered_name
)
cell_consensus_name = (
    'cell_mat_consensus_{}.feather'.format(cur_time)
    if cell_consensus_name is None else cell_consensus_name
)
cell_weights_name = (
    'cell_weights_{}.feather'.format(cur_time)
    if cell_weights_name is None else cell_weights_name
)

Set the following arguments:

* cluster_col: whether to count the number of pixel SOM or meta clusters per cell. Use 'cluster' for pixel SOM clusters and 'hCluster_cap' for pixel meta clusters
* cell_som_xdim: the number of nodes along the x-axis for the cluster mapping space
* cell_som_ydim: the number of nodes along the y-axis for the cluster mapping space (total number of cluster assignments xdim * ydim)
* cell_som_lr_start: the initial learning rate
* cell_som_lr_end: the final learning rate, lr_start will degrade to lr_end
* cell_som_num_passes: the number of training passes to make through the dataset

In [None]:
# 'cluster' counts pixel SOM clusters per cell; use 'hCluster_cap' for pixel meta clusters
cluster_col = 'cluster'
# SOM grid size: xdim * ydim gives the total number of cluster assignments
cell_som_xdim = 10
cell_som_ydim = 10
# the learning rate starts at lr_start and degrades to lr_end
cell_som_lr_start = 0.05
cell_som_lr_end = 0.01
# number of training passes to make through the dataset
cell_som_num_passes = 1

In [None]:
# train the cell-level SOM from the cell table and the pixel cluster column chosen
# in cluster_col, then store its weights under weights_name
som_utils.train_cell_som(
    fovs, base_dir,
    cell_table_name=cell_table_name,
    pixel_consensus_dir=pixel_consensus_dir, cluster_col=cluster_col,
    weights_name=cell_weights_name,
    xdim=cell_som_xdim, ydim=cell_som_ydim,
    lr_start=cell_som_lr_start, lr_end=cell_som_lr_end,
    num_passes=cell_som_num_passes
)

### Assign cell SOM clusters

In [None]:
# assign cell SOM clusters from the trained cell weights; output name is cell_clustered_name
som_utils.cluster_cells(
    base_dir, weights_name=cell_weights_name, cell_cluster_name=cell_clustered_name
)

In [None]:
# heatmap of the per-cluster averages, grouped by the cell SOM cluster column
som_utils.visualize_cell_cluster_data(
    base_dir,
    cluster_name=cell_clustered_name, column_prefix=cluster_col,
    cell_cluster_col='cluster'
)

### Run cell consensus clustering

Set the following arguments:

* cell_consensus_max_k: the number of meta clusters to create
* cell_consensus_cap: the maximum z-score to consider, anything greater will be clipped to this value

In [None]:
# number of meta clusters to create
cell_consensus_max_k = 20
# maximum z-score to consider; anything greater is clipped to this value
cell_consensus_cap = 3

In [None]:
# consensus (hierarchical) clustering on top of the cell SOM cluster assignments;
# reads cell_clustered_name and stores the combined assignments in cell_consensus_name
som_utils.cell_consensus_cluster(
    base_dir=base_dir,
    cell_cluster_name=cell_clustered_name, column_prefix=cluster_col,
    cell_consensus_name=cell_consensus_name,
    max_k=cell_consensus_max_k, cap=cell_consensus_cap
)

In [None]:
# heatmap of the per-cluster averages, grouped by the cell hierarchical cluster column
som_utils.visualize_cell_cluster_data(
    base_dir,
    cluster_name=cell_consensus_name,
    column_prefix=cluster_col,
    cell_cluster_col='hCluster_cap'
)