# Validations

### Common Imports

In [1]:
import sys
import os

root_path = os.path.dirname(os.getcwd())
src_path = os.path.join(root_path, "src")
sys.path.insert(0, src_path)

%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import torch
import numpy
import matplotlib.pyplot as plt
from tissue_purifier.data_utils.datamodule import SlideSeqKidneyDM, SlideSeqTestisDM
from tissue_purifier.plot_utils.plot_images import show_raw_all_channels, show_raw_one_channel
from tissue_purifier.plot_utils.plot_misc import plot_composition

In [3]:
import neptune.new as neptune
NEPTUNE_TOKEN = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJjZTkyYmJiYi0wN2E1LTRkY2YtOWU3Ny1kNjhjYmM3ZTVkNWEifQ=="
NEPTUNE_PROJECT = "cellarium/tissue-purifier"

dataset = "testis_dataset" 
logging_mode = "offline" # or "async"

if dataset == "kidney_dataset":
    DM = SlideSeqKidneyDM
elif dataset == "testis_dataset":
    DM = SlideSeqTestisDM
else:
    raise Exception()

exp: neptune.run.Run = neptune.init(project=NEPTUNE_PROJECT,
                                    api_token=NEPTUNE_TOKEN,
                                    mode=logging_mode,
                                    tags=["validation", dataset])
    
config_dict = DM.get_default_params()
    
config_dict['n_crops_for_tissue_test'] = 20
config_dict['dropout_range'] = [0.0, 0.2]
config_dict['cohort'] = 'all'

# print(config_dict)

dm = DM(**config_dict)
dm.prepare_data()
dm.setup(stage=None)

offline/e56af8ab-fcfc-4bd5-bb36-57f40c78e4e8
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
-----> running datamodule init
-----> running datamodule prepare_data
untar data
number of elements ---> 31659
mean and median spacing 15.622264926052864, 15.420071793081707
The dense shape of the image is -> torch.Size([9, 1178, 1178])
Occupacy (zero, single, double, ...) of voxels in 3D sparse array -> [    0 31649     5]
Occupacy (zero, single, double, ...) of voxels  in 2D sparse array (summed over category) -> [    0 31629    15]
number of elements ---> 33059
mean and median spacing 15.358030584634598, 15.508166323067783
The dense shape of the image is -> torch.Size([9, 1180, 855])
Occupacy (zero, single, double, ...) of voxels in 3D sparse array -> [    0 33033    13]
Occupacy (zero, single, double, ...) of voxels  in 2D s

### Get the full images fron the train dataloaders

In [50]:
train_loader = dm.train_dataloader()
train_dataset = train_loader.dataset
sp_images = train_dataset.imgs
metadatas = train_dataset.metadatas
f_names = [meta.f_name for meta in metadatas]
cell_to_code_dict = sp_images[0]._categories_to_codes

### 1. Analyze sparse images by tiling them with patches

In [5]:
from tissue_purifier.misc_utils.dict_util import inspect_dict
from tissue_purifier.model_utils.analyzer import SpatialAutocorrelation, Composition

# ADD MORE MODELS, more features, etc

N_patches_max = 100

analyzers = [
    Composition(return_fraction=True), 
    SpatialAutocorrelation(modality='moran', n_neighbours=6, neigh_correct=True)]

feature_names= [
    "feature_composition", 
    "moran_I"]

for sp_img, f_name in zip(train_dataset.imgs, f_names):
    sp_img.analyze_with_tiling(
        cropper=dm.cropper_test,
        patch_analyzers=analyzers,
        feature_names=feature_names,
        batch_size=64,
        n_patches_max=N_patches_max,
        overwrite=True)
    sp_img.patch_properties_dict['classify_tissue_label'] = [f_name] * N_patches_max
    
# inspect_dict(sp_img.patch_properties_dict)

# 2. Concatenate the embeddings from all the images

In [8]:
from tissue_purifier.misc_utils.dict_util import concatenate_list_of_dict

list_of_dict = [sp_img.patch_properties_dict for sp_img in sp_images]
all_features_dict = concatenate_list_of_dict(list_of_dict)
inspect_dict(all_features_dict)

feature_composition <class 'torch.Tensor'> torch.Size([600, 9]) cpu
moran_I <class 'torch.Tensor'> torch.Size([600, 9]) cpu
patch_xywh <class 'torch.Tensor'> torch.Size([600, 4]) cpu
classify_tissue_label <class 'list'> 600


### 3. Perform PCA, UMAP, Leiden on the learned features

In [9]:
from tissue_purifier.misc_utils.misc import SmartPca, SmartUmap, SmartLeiden

In [38]:
feature_keys = ['feature_composition'] #features_teacher_bbone', 'features_composition']

for key in feature_keys:
    # print("working on -->", key)
    if key.endswith("composition"):
        smart_pca = SmartPca(preprocess_strategy='z_score')
        smart_umap = SmartUmap(n_neighbors=25, preprocess_strategy='raw', n_components=2, min_dist=0.5, metric='cosine')
        input_features = all_features_dict[key]
        embeddings_pca = smart_pca.fit_transform(input_features, n_components=0.9)
        embeddings_umap = smart_umap.fit_transform(input_features)
        umap_graph = smart_umap.get_graph()
    else:
        smart_pca = SmartPca(preprocess_strategy='z_score')
        smart_umap = SmartUmap(n_neighbors=25, preprocess_strategy='raw', n_components=2, min_dist=0.5, metric='euclidean')
        input_features = all_features_dict[key]
        embeddings_pca = smart_pca.fit_transform(input_features, n_components=0.9)
        embeddings_umap = smart_umap.fit_transform(embeddings_pca)
        umap_graph = smart_umap.get_graph()
    
    all_features_dict["pca_"+key] = torch.from_numpy(embeddings_pca)
    all_features_dict["umap_"+key] = torch.from_numpy(embeddings_umap)
    
    smart_leiden = SmartLeiden(graph=umap_graph)
    for resolution in [0.01, 0.1, 0.3, 0.5]:
        cluster_labels = smart_leiden.cluster(resolution=resolution)
        all_features_dict["leiden_res_"+str(resolution)+"_"+key] = torch.nn.functional.one_hot(torch.from_numpy(cluster_labels).long())  # need to make it one-hot so that I can average 

In [37]:
# inspect_dict(all_features_dict)

Plot a lot of maps

In [51]:
from tissue_purifier.plot_utils.plot_embeddings import plot_all_maps

maps = plot_all_maps(all_features_dict, legend=False)

for tmp in maps:
    exp["maps"].log(neptune.types.File.as_image(tmp))
    
#maps[3]

Add some quantities to regress and classify

In [36]:
#inspect_dict(all_features_dict)

all_features_dict['regress_moran'] = torch.max(all_features_dict['moran_I'], dim=-1)[0]
all_features_dict['classify_condition'] = ['wt' if label.startswith('wt') else 'dis' for label in all_features_dict['classify_tissue_label']]

#inspect_dict(all_features_dict)

In [52]:
torch.save(all_features_dict, "all_features_dist.pt")

# 3. Transfer the annotations to the sparse_images.patch_properties_dict

In [41]:
from tissue_purifier.misc_utils.dict_util import transfer_annotations_between_dict
    
anchor_key = 'patch_xywh'
annotation_keys = list(all_features_dict.keys())
annotation_keys.remove('patch_xywh')
    
for sp_img in sp_images:    
    transfer_annotations_between_dict(
        source_dict=all_features_dict,
        dest_dict=sp_img.patch_properties_dict,
        annotation_keys=annotation_keys,
        anchor_key=anchor_key,
    )

#inspect_dict(sp_images[0].patch_properties_dict)

### 2. From path property to image property

In [42]:
annotation_keys = list(sp_images[0].patch_properties_dict.keys())
annotation_keys.remove('classify_condition')
annotation_keys.remove('classify_tissue_label')
annotation_keys.remove('patch_xywh')
print(annotation_keys)

for sp_img in sp_images:
    sp_img.path_property_to_image_property(
        keys=annotation_keys,
        overwrite=True,
        verbose=False)

#inspect_dict(image_dict)

['feature_composition', 'moran_I', 'pca_feature_composition', 'umap_feature_composition', 'leiden_res_0.01_feature_composition', 'leiden_res_0.1_feature_composition', 'leiden_res_0.3_feature_composition', 'leiden_res_0.5_feature_composition', 'regress_moran']
The key feature_composition is already present in image_properties_dict.                         This value will be overwritten
The key moran_I is already present in image_properties_dict.                         This value will be overwritten
The key pca_feature_composition is already present in image_properties_dict.                         This value will be overwritten
The key umap_feature_composition is already present in image_properties_dict.                         This value will be overwritten
The key leiden_res_0.01_feature_composition is already present in image_properties_dict.                         This value will be overwritten
The key leiden_res_0.1_feature_composition is already present in image_properties_dict.

### 3. Visualize the Image Properties

In [20]:
from tissue_purifier.plot_utils.plot_images import show_raw_all_channels, show_raw_one_channel

Plot the cell composition

In [43]:
cell_names = list(cell_to_code_dict.keys())

for sp_img, f_name in zip(sp_images, f_names):
    fig = show_raw_one_channel(sp_img.image_properties_dict["feature_composition"], n_col=3, in_range=(0.0, 1.0),
                    titles=list(cell_to_code_dict.keys()), sup_title="Cell Composition of {0}".format(f_name), cmap="magma")
    exp["masks/cell_composition"].log(neptune.types.File.as_image(fig))
#fig 

In [44]:
for cell_type, code in cell_to_code_dict.items():
    # print(cell_type, code)
    fig = show_raw_one_channel([sp_img.image_properties_dict["feature_composition"][code] for sp_img in sp_images], n_col=3, 
                               in_range=(0.0, 1.0),
                               titles=f_names, sup_title="Cell Composition of {0}".format(cell_type), cmap="magma")
    exp["masks/cell_composition"].log(neptune.types.File.as_image(fig))

#fig

Plot the moran score

In [45]:
fig = show_raw_one_channel([sp_img.image_properties_dict["regress_moran"][0] for sp_img in sp_images], n_col=3, 
                            scale_each=False, in_range='image', 
                            titles=f_names, sup_title="Moran score", cmap="magma")
exp["masks/moran"].log(neptune.types.File.as_image(fig))
#fig

Plot the Lieden cluster

In [46]:
image_keys = sp_images[0].image_properties_dict.keys()
for key in image_keys:
    if key.startswith("leiden"):
        print(key)
        fig = show_raw_all_channels([sp_img.image_properties_dict[key] for sp_img in sp_images], n_col=3, titles=f_names, sup_title=key, cmap=plt.cm.tab10, show_colorbar=False)
        exp["masks/leiden/all_chs"].log(neptune.types.File.as_image(fig))
#fig

leiden_res_0.01_feature_composition
leiden_res_0.1_feature_composition
leiden_res_0.3_feature_composition
leiden_res_0.5_feature_composition


In [47]:
image_keys = sp_images[0].image_properties_dict.keys()
for key in image_keys:
    if key.startswith("leiden"):
        for sp_img, f_name in zip(sp_images, f_names):
            n_clusters = sp_img.image_properties_dict[key].shape[-3]
            titles = ["leiden cluster: "+str(n) for n in range(n_clusters)]
            fig = show_raw_one_channel(sp_img.image_properties_dict[key], sup_title="{0} by {1}".format(f_name,key),
                                      titles=titles)
            exp["masks/leiden/"+key].log(neptune.types.File.as_image(fig))
# fig

Plot the PCA components

In [48]:
for key in image_keys:
    if key.startswith("pca"):
        print(key)
        fig = show_raw_one_channel([sp_img.image_properties_dict[key][0] for sp_img in sp_images], n_col=3, titles=f_names, sup_title="PCA1 by "+key, cmap="seismic")
        exp["masks/pca"].log(neptune.types.File.as_image(fig))
        fig = show_raw_one_channel([sp_img.image_properties_dict[key][1] for sp_img in sp_images], n_col=3, titles=f_names, sup_title="PCA2 by "+key, cmap="seismic")
        exp["masks/pca"].log(neptune.types.File.as_image(fig))
# fig

pca_feature_composition


Plot the UMAP components

In [49]:
for key in image_keys:
    if key.startswith("umap"):
        print(key)
        fig = show_raw_one_channel([sp_img.image_properties_dict[key][0] for sp_img in sp_images], n_col=3, titles=f_names, sup_title="UMAP1 by "+key, cmap="seismic")
        exp["masks/umap"].log(neptune.types.File.as_image(fig))
        fig = show_raw_one_channel([sp_img.image_properties_dict[key][1] for sp_img in sp_images], n_col=3, titles=f_names, sup_title="UMAP2 by "+key, cmap="seismic")
        exp["masks/umap"].log(neptune.types.File.as_image(fig))
        
# fig

umap_feature_composition


### 4. Transfer annotation on spot

In [None]:
# inspect_dict(sp_images[0].image_properties_dict)

keys_image = list(sp_images[0].image_properties_dict.keys())
keys_image.remove("umap_feature_composition")
keys_image.remove("pca_feature_composition")
keys_image.remove("moran_I")

for sp_img in sp_images:
    sp_img.image_property_to_spot_property(
        keys=keys_image,
        overwrite=True)
    
# inspect_dict(sp_images[0].spot_properties_dict)

# Usage statistics of clusters

In [54]:

TODO

NameError: name 'TODO' is not defined

In [28]:
from tissue_purifier.plot_utils.plot_misc import plot_counters
from tissue_purifier.misc_utils.misc import channel_counter_in_window

ImportError: cannot import name 'plot_counters' from 'tissue_purifier.plot_utils.plot_misc' (/mnt/disks/additional_persistent_disk/REPOS/ML_for_slideseq/src/tissue_purifier/plot_utils/plot_misc.py)

In [None]:
for key in image_keys:
    if key.startswith("leiden"):
        print(key)
        leiden_counters = [channel_counter_in_window(sp_img.image_properties_dict[key]) for sp_img in sp_images]
        fig = plot_counters(leiden_counters, dataset_labels=f_names, title="normalized utilization of clusters by {0}".format(key))
        exp["usage"].log(neptune.types.File.as_image(fig))

In [None]:
cell_types = list(sp_images[0]._categories_to_codes.keys())

cell_counters = [channel_counter_in_window(sp_img) for sp_img in sp_images]
fig = plot_counters(cell_counters, dataset_labels=f_names, title="normalized utilization of cells", x_labels=cell_types)
exp["usage"].log(neptune.types.File.as_image(fig))

# Now you can do DE

In [53]:
TO DO

SyntaxError: invalid syntax (<ipython-input-53-f091c5049fd2>, line 1)

In [None]:
gather all spot dict from the sparse images.... do DE...

In [None]:
exp.stop()