### This notebook runs FlowSOM pixel-level clustering

In [1]:
# import required packages
import os

import sys
sys.path.append('..')
sys.path.append('../ark')

import numpy as np
import pandas as pd
import xarray as xr

from ark.phenotyping import preprocess
from ark.phenotyping import cluster
from ark.utils import io_utils, load_utils

### Set file paths and parameters

In [2]:
# input locations: everything lives under base_dir
base_dir = "/Users/alexkong/Downloads/granulomaCohort_allData"

# image folder and cohort table, both resolved relative to base_dir
tiff_dir, all_data_path = (
    os.path.join(base_dir, leaf) for leaf in ("TIFs", "cohortDatav.csv")
)

In [3]:
# MIBItiff settings
# mibitiff_suffix is only consulted when MIBItiff is True (it filters the files listed in tiff_dir)
mibitiff_suffix = "-MassCorrected-Filtered.tiff"
# False: fovs are taken from the folders inside tiff_dir instead of from MIBItiff files
MIBItiff = False

In [4]:
# either get all fovs in the folder...
if MIBItiff:
    # MIBItiff mode: list the files in tiff_dir, filtered by mibitiff_suffix
    # (the variable is lower-case; the previous 'MIBItiff_suffix' spelling was undefined)
    fovs = io_utils.list_files(tiff_dir, substrs=mibitiff_suffix)
else:
    # folder mode: every sub-folder of tiff_dir is treated as one fov
    fovs = io_utils.list_folders(tiff_dir)

# ... or optionally, select a specific set of fovs manually
# fovs = ["fov1", "fov2"]

In [2]:
# channels to keep when loading images and when selecting columns for the SOM
chan_list = [
    "CD45", "SMA", "Vimentin", "CD31", "Keratin-pan",
    "E-cadherin", "MastChyTry", "MPO", "CD20", "CD3",
    "CD14", "HLA-DR-DQ-DP", "Foxp3", "CD16", "CD11c",
    "CD206", "CD11b", "CD68", "CD163", "CD209",
]

### Load data

In [6]:
# load the image data for the selected fovs, restricted to the channels in chan_list
if not MIBItiff:
    # folder layout: per-fov directories, images read from the "TIFsNoAgg" sub-folder
    img_xr = load_utils.load_imgs_from_tree(
        tiff_dir, img_sub_folder="TIFsNoAgg", fovs=fovs, channels=chan_list
    )
else:
    # MIBItiff layout: fovs holds the MIBItiff file names
    img_xr = load_utils.load_imgs_from_mibitiff(
        tiff_dir, mibitiff_files=fovs, channels=chan_list
    )

In [7]:
# read the cohort data table from all_data_path
all_data = pd.read_csv(filepath_or_buffer=all_data_path)

In [8]:
# read the segmentation labels from base_dir as integer images with a single
# 'whole_cell' entry along the 'compartments' dimension
segmentation_labels = load_utils.load_imgs_from_dir(
    base_dir,
    xr_dim_name='compartments',
    xr_channel_names=['whole_cell'],
    force_ints=True,
)

In [9]:
# to make fov names consistent, replace the 'segmentationmask_SampleID' substring in each
# segmentation label fov name with 'Point'
# assign_coords is used instead of writing to .coords['fovs'].values, because newer xarray
# versions refuse in-place assignment to the values of a dimension coordinate
# NOTE(review): assumes 'fovs' is a dimension coordinate of segmentation_labels - confirm
segmentation_labels = segmentation_labels.assign_coords(
    fovs=[
        fov.replace('segmentationmask_SampleID', 'Point')
        for fov in segmentation_labels.coords['fovs'].values
    ]
)

### Preprocess the data

In [None]:
# run FlowSOM preprocessing: build the pixel matrix from the loaded image data and the
# segmentation labels
# NOTE(review): the result is treated as a pandas DataFrame further down (.to_csv) - confirm
# against preprocess.create_pixel_matrix
flowsom_data = preprocess.create_pixel_matrix(img_xr, segmentation_labels)

In [12]:
# write data to csv; on later runs preprocessing can be skipped by starting at
# "Train the SOM", which reloads this file
# create the output folder first: to_csv does not create missing directories
os.makedirs('pixel_results', exist_ok=True)
flowsom_data.to_csv(os.path.join('pixel_results', 'example_pixel_matrix.csv'), index=False)

### Train the SOM

In [3]:
# reload the pixel matrix saved by the preprocessing step
flowsom_data = pd.read_csv(
    filepath_or_buffer=os.path.join('pixel_results', 'example_pixel_matrix.csv')
)

In [4]:
# preview the first ten rows of the pixel matrix
flowsom_data.head(n=10)

Unnamed: 0,CD45,SMA,Vimentin,CD31,Keratin-pan,E-cadherin,MastChyTry,MPO,CD20,CD3,...,CD11c,CD206,CD11b,CD68,CD163,CD209,fov,x_coord,y_coord,seg_label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000258,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,61,1
1,0.000224,0.000224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,66,1
2,0.0,0.0,6e-06,6e-06,6e-06,6e-06,6e-06,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,416,1
3,0.0,0.0,0.0,6e-06,6e-06,6e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,417,1
4,3e-06,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,636,1
5,2e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,755,1
6,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,759,1
7,9e-06,7e-06,3e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,760,1
8,4e-06,4e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,761,1
9,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Point35,0,763,1


TODO: implement FlowSOM clustering

In [None]:
# train the SOM on the channel columns only
# parameters mirror the R SOM package: 10 x_neurons, 10 y_neurons, and 10 passes over the
# dataset (hence num_iters = 10 * number of rows)
# how learning_rate and sigma are derived there is still unclear
weights = cluster.train_flowsom(
    flowsom_data[chan_list],
    num_iters=10 * len(flowsom_data),
)

In [7]:
# persist the trained SOM weights so the clustering step can be run without retraining
np.save(file=os.path.join('pixel_results', 'som_weights.npy'), arr=weights)

### Cluster the data

In [8]:
# reload the SOM weights saved after training
weights = np.load(file=os.path.join('pixel_results', 'som_weights.npy'))

In [10]:
# cluster the data: pass the channel columns and the trained weights to cluster_flowsom and
# store the result as a new 'cluster_label' column (this modifies flowsom_data in place)
# NOTE(review): labels appear to be (x, y) SOM grid positions, judging by the value_counts
# output below - confirm against cluster.cluster_flowsom
flowsom_data['cluster_label'] = cluster.cluster_flowsom(flowsom_data[chan_list], weights)

In [11]:
# count how many pixels were assigned to each cluster label, largest first
flowsom_data['cluster_label'].value_counts()

(0, 0)    633959
(1, 0)    373880
(2, 0)    329334
(9, 3)    286675
(0, 6)    219845
(4, 0)    186599
(9, 2)    161918
(3, 1)    146521
(3, 9)    123981
(9, 7)    100177
(1, 5)     82594
(4, 9)     74916
(3, 0)     69925
(0, 1)     64402
(1, 9)     59678
(8, 6)     58858
(1, 1)     58834
(9, 5)     58785
(0, 4)     54447
(0, 3)     54100
(2, 4)     52579
(6, 9)     45131
(5, 9)     42588
(8, 7)     40231
(0, 5)     36927
(2, 9)     36180
(5, 4)     36147
(9, 8)     32914
(1, 4)     27765
(7, 7)     27676
           ...  
(8, 3)      3402
(6, 8)      3214
(4, 7)      3143
(6, 5)      3008
(7, 2)      2935
(6, 0)      2935
(1, 3)      2922
(6, 2)      2838
(4, 6)      2761
(6, 3)      2757
(2, 6)      2633
(3, 7)      2483
(5, 6)      2347
(7, 4)      1958
(5, 5)      1895
(4, 4)      1659
(8, 1)      1649
(3, 3)      1448
(6, 4)      1394
(5, 2)      1393
(7, 6)      1330
(7, 3)      1277
(6, 6)       788
(5, 1)       704
(5, 3)       677
(4, 1)       641
(7, 5)       597
(8, 4)       4