### This notebook runs FlowSOM pixel-level clustering

In [1]:
# import required packages
import os

import sys
sys.path.append('..')
sys.path.append('../ark')

import numpy as np
import pandas as pd
import xarray as xr

from ark.phenotyping import preprocess
from ark.phenotyping import cluster
from ark.utils import io_utils, load_utils

### Set file paths and parameters

In [2]:
# assign file paths
# the cohort directory can be overridden through the ARK_BASE_DIR environment variable;
# when it is not set, the original hard-coded location is used unchanged
base_dir = os.environ.get("ARK_BASE_DIR", "/Users/alexkong/Downloads/granulomaCohort_allData")
# folder holding the image data, inside base_dir
tiff_dir = os.path.join(base_dir, "TIFs")
# csv file read later with pd.read_csv, inside base_dir
all_data_path = os.path.join(base_dir, "cohortDatav.csv")

In [3]:
# MIBItiff settings
# suffix used to pick out MIBItiff files when listing tiff_dir
mibitiff_suffix = "-MassCorrected-Filtered.tiff"
# switch between the MIBItiff loading path (True) and the folder-tree loading path (False)
MIBItiff = False

In [4]:
# either get all fovs in the folder...
if MIBItiff:
    # filter the file listing by the suffix defined in the mibitiff parameters cell
    # (the variable is named mibitiff_suffix, all lowercase)
    fovs = io_utils.list_files(tiff_dir, substrs=mibitiff_suffix)
else:
    # non-MIBItiff data: each fov is a sub-folder of tiff_dir
    fovs = io_utils.list_folders(tiff_dir)

# ... or optionally, select a specific set of fovs manually
# fovs = ["fov1", "fov2"]

In [2]:
# channels used for loading the images and as the feature columns for the SOM
chan_list = [
    "CD45", "SMA", "Vimentin", "CD31",
    "Keratin-pan", "E-cadherin", "MastChyTry", "MPO",
    "CD20", "CD3", "CD14", "HLA-DR-DQ-DP",
    "Foxp3", "CD16", "CD11c", "CD206",
    "CD11b", "CD68", "CD163", "CD209",
]

### Load data

In [6]:
# read the selected fovs and channels into img_xr, using the loader that matches the input format
if not MIBItiff:
    # one folder per fov, with the channel images inside the "TIFsNoAgg" sub-folder
    img_xr = load_utils.load_imgs_from_tree(
        tiff_dir,
        img_sub_folder="TIFsNoAgg",
        fovs=fovs,
        channels=chan_list,
    )
else:
    # one MIBItiff file per fov
    img_xr = load_utils.load_imgs_from_mibitiff(
        tiff_dir,
        mibitiff_files=fovs,
        channels=chan_list,
    )

In [7]:
# read the cohort csv (path set in the file paths cell) into a DataFrame
all_data = pd.read_csv(filepath_or_buffer=all_data_path)

In [8]:
# read the segmentation masks into a single-channel ('whole_cell') array with integer values
# NOTE(review): the masks are read directly from base_dir, not tiff_dir - confirm this is intended
segmentation_labels = load_utils.load_imgs_from_dir(
    base_dir,
    xr_dim_name='compartments',
    xr_channel_names=['whole_cell'],
    force_ints=True,
)

In [9]:
# to make fov names consistent, replace the 'segmentationmask_SampleID' prefix of the
# segmentation label fov coordinates with 'Point'
renamed_fovs = [
    fov.replace('segmentationmask_SampleID', 'Point')
    for fov in segmentation_labels.coords['fovs'].values
]
# assign_coords returns a new array carrying the renamed coordinate; assigning to .values
# of a dimension coordinate in place is rejected by current xarray versions
segmentation_labels = segmentation_labels.assign_coords(fovs=renamed_fovs)

### Preprocess the data

In [None]:
# build the pixel matrix for FlowSOM from the loaded images and the segmentation labels
flowsom_data = preprocess.create_pixel_matrix(
    img_xr,
    segmentation_labels,
)

In [12]:
# write data to csv, start at "Train the SOM" after this (that section reloads this file)
# create the output folder first so the write does not fail on a fresh checkout
os.makedirs('pixel_results', exist_ok=True)
flowsom_data.to_csv(os.path.join('pixel_results', 'example_pixel_matrix.csv'), index=False)

### Train the SOM

In [6]:
# reload the pixel matrix saved by the preprocessing step
flowsom_data = pd.read_csv(
    filepath_or_buffer=os.path.join('pixel_results', 'example_pixel_matrix.csv')
)

In [7]:
# preview the first 10 rows of the pixel matrix
flowsom_data.head(10)

Unnamed: 0,CD45,SMA,Vimentin,CD31,Keratin-pan,E-cadherin,MastChyTry,MPO,CD20,CD3,...,CD11c,CD206,CD11b,CD68,CD163,CD209,fov,x_coord,y_coord,seg_label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000258,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,61,1
1,0.000224,0.000224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,66,1
2,0.0,0.0,6e-06,6e-06,6e-06,6e-06,6e-06,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,416,1
3,0.0,0.0,0.0,6e-06,6e-06,6e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,417,1
4,3e-06,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,636,1
5,2e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,755,1
6,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,759,1
7,9e-06,7e-06,3e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,760,1
8,4e-06,4e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,761,1
9,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,763,1


In [None]:
# fit the SOM on the channel columns only
# parameters follow the R SOM package: a 10 x 10 grid of neurons and 10 passes through the dataset
weights = cluster.train_som(
    flowsom_data[chan_list],
    x_neurons=10,
    y_neurons=10,
    num_passes=10,
)

In [9]:
# persist the trained SOM weights so clustering can be run without retraining
np.save(file=os.path.join('pixel_results', 'som_weights.npy'), arr=weights)

### Cluster the data

In [8]:
# reload the SOM weights saved after training
weights = np.load(file=os.path.join('pixel_results', 'som_weights.npy'))

In [9]:
# display the weights array loaded from disk
weights

array([[[1.17192684e-06, 1.31546958e-06, 1.38393244e-06, ...,
         3.22568118e-12, 1.05033620e-09, 8.09850078e-09],
        [8.64176838e-07, 1.22021061e-06, 1.45288661e-06, ...,
         9.25687583e-15, 1.18013647e-13, 1.08571619e-11],
        [6.62868365e-07, 9.43217141e-07, 1.22443344e-06, ...,
         7.97446105e-12, 9.85087507e-15, 6.61379556e-14],
        ...,
        [2.38096624e-07, 4.78722989e-07, 4.05807440e-07, ...,
         6.28139082e-10, 8.66790014e-15, 3.97685006e-09],
        [6.23612073e-07, 5.42861926e-07, 3.66708349e-07, ...,
         8.98187697e-12, 1.63420453e-15, 1.14222496e-10],
        [7.34295224e-07, 6.27579479e-07, 3.25655346e-07, ...,
         1.06555795e-11, 1.27538959e-12, 3.80828403e-10]],

       [[1.02047076e-06, 9.33758828e-07, 8.84154194e-07, ...,
         9.43456862e-15, 1.02024737e-13, 3.48441409e-10],
        [7.61762718e-07, 9.51748579e-07, 1.04799187e-06, ...,
         1.34660006e-13, 1.71601696e-13, 1.43651063e-11],
        [5.65710542e-07, 

In [10]:
# assign a SOM cluster label to every row, using the channel columns and the loaded weights
flowsom_data['cluster_label'] = cluster.cluster_som(
    flowsom_data[chan_list],
    weights,
)

In [11]:
# count the rows assigned to each cluster label, most frequent first
flowsom_data['cluster_label'].value_counts()

1     633959
3     373880
4     329334
38    286675
14    219845
39    186599
7     161918
20    146521
0     123981
22    100177
35     82594
5      74916
21     69925
2      64402
12     59678
6      58858
18     58834
13     58785
33     54447
23     54100
15     52579
44     45131
19     42588
11     40231
66     36927
16     36180
47     36147
51     32914
52     27765
9      27676
       ...  
87      3402
72      3214
74      3143
59      3008
82      2935
45      2935
29      2922
85      2838
75      2761
92      2757
61      2633
73      2483
50      2347
83      1958
77      1895
46      1659
99      1649
64      1448
86      1394
79      1393
71      1330
84      1277
78       788
97       704
96       677
80       641
94       597
91       493
98       322
88       245
Name: cluster_label, Length: 100, dtype: int64