### This notebook runs FlowSOM pixel-level clustering

In [1]:
# import required packages
import os
import subprocess

import sys
sys.path.append('..')
sys.path.append('../ark')

import numpy as np
import pandas as pd
import xarray as xr

from ark.phenotyping import preprocess
from ark.phenotyping import cluster
from ark.utils import io_utils, load_utils

### Set file paths and parameters

In [2]:
# locations used throughout the notebook, all relative to the cohort's root directory
base_dir = "../data/granulomaCohort_allData"
# csv holding the data cohort (read with pandas further down)
all_data_path = os.path.join(base_dir, "cohortDatav.csv")
# directory the fovs are listed and loaded from
tiff_dir = os.path.join(base_dir, "TIFs")

In [3]:
# MIBItiff settings
# filename suffix used to pick out the MIBItiff files when listing tiff_dir
mibitiff_suffix = '-MassCorrected-Filtered.tiff'
# switch: True lists/loads MIBItiff files, False lists/loads a folder per fov
MIBItiff = False

In [4]:
# Build the list of fovs to process.
# Either discover everything under tiff_dir: with MIBItiff inputs the fovs are the files
# whose names contain the MIBItiff suffix, otherwise they are the sub-folders of tiff_dir...
if MIBItiff:
    # fix: the suffix variable is `mibitiff_suffix` (lower-case); the previous spelling
    # `MIBItiff_suffix` was never defined, so this branch raised a NameError
    fovs = io_utils.list_files(tiff_dir, substrs=mibitiff_suffix)
else:
    fovs = io_utils.list_folders(tiff_dir)

# ... or optionally, select a specific set of fovs manually
# (this assignment replaces the discovered list; comment it out to process every fov)
fovs = ["Point6", "Point7"]

In [5]:
# channels to subset over when loading the images (order is preserved downstream)
chan_list = [
    "CD45", "SMA", "Vimentin", "CD31", "Keratin-pan",
    "E-cadherin", "MastChyTry", "MPO", "CD20", "CD3",
    "CD14", "HLA-DR-DQ-DP", "Foxp3", "CD16", "CD11c",
    "CD206", "CD11b", "CD68", "CD163", "CD209",
]

### Load data

In [6]:
# read the selected fovs and channels into img_xr, using the loader that matches the input format
if not MIBItiff:
    # one folder per fov under tiff_dir, images taken from its "TIFsNoAgg" sub-folder
    img_xr = load_utils.load_imgs_from_tree(
        tiff_dir,
        img_sub_folder="TIFsNoAgg",
        fovs=fovs,
        channels=chan_list,
        dtype="int16",
    )
else:
    # fovs holds MIBItiff file names in this mode
    img_xr = load_utils.load_imgs_from_mibitiff(
        tiff_dir,
        mibitiff_files=fovs,
        channels=chan_list,
        dtype="int16",
    )

In [7]:
# read the cohort csv into a DataFrame
all_data = pd.read_csv(filepath_or_buffer=all_data_path)

In [8]:
# read the segmentation label images from base_dir, requesting integer values
segmentation_labels = load_utils.load_imgs_from_dir(
    base_dir,
    xr_dim_name='compartments',
    xr_channel_names=['whole_cell'],
    force_ints=True,
)

In [9]:
# make the segmentation fov names line up with the image fov names ("Point6", ...):
# swap the 'segmentationmask_SampleID' text in each fov coordinate for 'Point'
segmentation_labels = segmentation_labels.assign_coords(
    fovs=[
        label_fov.replace('segmentationmask_SampleID', 'Point')
        for label_fov in segmentation_labels.coords['fovs'].values
    ]
)

### Preprocess

In [10]:
# run FlowSOM preprocessing: build the pixel matrix from the loaded image data
# and the renamed segmentation labels
flowsom_data = preprocess.create_pixel_matrix(img_xr, segmentation_labels)

In [17]:
# save the pixel matrix as a csv in base_dir (without the index column);
# once this file exists you can start at "Cluster the data" after this
flowsom_data.to_csv(
    os.path.join(base_dir, 'example_pixel_matrix.csv'),
    index=False,
)

### Train the SOM

In [23]:
!Rscript ../som_runner.R {os.path.join(base_dir, 'example_pixel_matrix.csv')} {','.join(chan_list).replace('E-cadherin', '')} {base_dir}

Loading required package: igraph

Attaching package: ‘igraph’

The following objects are masked from ‘package:stats’:

    decompose, spectrum

The following object is masked from ‘package:base’:

    union

[1] "Reading command args"
[1] "Getting pixel mat data path"
[1] "Creating marker list"
 [1] "CD45"         "SMA"          "Vimentin"     "CD31"         "Keratin-pan" 
 [6] ""             "MastChyTry"   "MPO"          "CD20"         "CD3"         
[11] "CD14"         "HLA-DR-DQ-DP" "Foxp3"        "CD16"         "CD11c"       
[16] "CD206"        "CD11b"        "CD68"         "CD163"        "CD209"       
[1] "Getting the write directory"
[1] "Reading the pixel matrix data"
In fread(pixelMatPath, select = markers) :
  Column name '' not found in column name header (case sensitive), skipping.
               CD45 SMA  Vimentin CD31 Keratin-pan MastChyTry MPO CD20 CD3
      1: 0.01539873   0 0.6582304    0           0          0   0    0   0
      2: 0.02142765   0 0.6352314    0      