# Validations

### Common Imports

In [1]:
import sys
import os

root_path = os.path.dirname(os.getcwd())
src_path = os.path.join(root_path, "src")
sys.path.insert(0, src_path)

%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import torch
import numpy
import matplotlib.pyplot as plt
from tissue_purifier.data_utils.datamodule import SlideSeqKidneyDM, SlideSeqTestisDM
from tissue_purifier.plot_utils.plot_images import show_raw_all_channels, show_raw_one_channel
from tissue_purifier.plot_utils.plot_misc import plot_composition

In [3]:
import neptune.new as neptune
NEPTUNE_TOKEN = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJjZTkyYmJiYi0wN2E1LTRkY2YtOWU3Ny1kNjhjYmM3ZTVkNWEifQ=="
NEPTUNE_PROJECT = "cellarium/tissue-purifier"

logging_mode = 'async'

exp: neptune.run.Run = neptune.init(project=NEPTUNE_PROJECT,
                                    api_token=NEPTUNE_TOKEN,
                                    mode=logging_mode,
                                    tags=["evaluate_embeddings"])

https://app.neptune.ai/cellarium/tissue-purifier/e/TIS-909
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


# Load and evaluate the emebeddings

In [4]:
all_features_dict = torch.load("all_features_dist.pt")

In [6]:
inspect_dict(all_features_dict)

NameError: name 'inspect_dict' is not defined

# TODO

1. Evaluate the embeddings
can the patch embeddings predict: 
- tissue_label, 
- tissue_condition, 
- moran_score

Compare features_composition vs dino_features

2. Classification with noisy labels
3. annotate the clusters in terms of cell composition (use scCODA)
4. do DE based on those clusters

In [None]:
from tissue_purifier.misc_utils.spatial_util import SpatialAutocorrelation
from tissue_purifier.misc_utils.misc import inspect_dict
from tissue_purifier.misc_utils.misc import SmartPca, SmartUmap, SmartLeiden
from tissue_purifier.plot_utils.plot_embeddings import plot_embeddings

In [None]:
smartpca = SmartPca(preprocess_strategy='raw')
embeddings = smartpca.fit_transform(data_compositions, n_components=2)

fig_composition = plot_embeddings(
        embeddings,
        x_label='pca1',
        y_label='pca2',
        title='tissue composition pca',
        size=20,
        cmap=plt.cm.viridis,
        labels=list_f_names,
        figsize=(6,6),
        legend=True,)

umap = SmartUmap(preprocess_strategy='raw', metric='cosine')
embeddings = umap.fit_transform(data_compositions)

graph = umap.get_graph()
leiden = SmartLeiden(graph, directed=True)
labels = leiden.cluster(resolution= 1.0)

plt.scatter(embeddings[:,0], embeddings[:, 1], c=labels)

torch.save(data, "cell_counters.pt")

### Read from file and re-create the model and datamodule

In [None]:

ckpt_file = 'ckpt_TIS_723.pt'  # do 723 after

# Get the model from checkpoint
dino = DinoModel.load_from_checkpoint(ckpt_file, strict=False)
config = dino._hparams
config["gpus"] = min(1, torch.cuda.device_count())
config["n_crops_for_tissue_test"] = 20 

In [None]:
from tissue_purifier.model_utils.classify_regress import classify_and_regress
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier

mlp_kargs = {
    "hidden_layer_sizes" : [],
    "solver": 'adam',
    "alpha" : 0.0001,
    "batch_size" : 'auto',
    "learning_rate" : 'constant',
    "learning_rate_init" : 0.0003,
    "max_iter" : 50000,
    "shuffle" : True,
    "random_state" : 1,
    "tol" : 1E-6,
    "verbose" : False,
    "n_iter_no_change" : 10,
    "early_stopping" :False,
}

kn_kargs = {
    "n_neighbors": 5,
}

feature_keys, regress_keys, classify_keys = [], [], []
for key in embeddings_dict.keys():
    #if key.startswith("pca") or key.startswith("umap") or key in ['features_teacher_bbone', 'features_composition']:
    if key in ['features_teacher_bbone', 'features_composition']:
        feature_keys.append(key)
    elif key.startswith("regress"):
        regress_keys.append(key)
    elif key.startswith("classify"):
        classify_keys.append(key)

#result_dict_mlp = classify_and_regress(
#    input_dict=embeddings_dict, 
#    feature_keys=feature_keys, 
#    regress_keys=regress_keys,
#    classify_keys=classify_keys,
#    regressor=MLPRegressor(**mlp_kargs),
#    classifier=MLPClassifier(**mlp_kargs),
#    add_prediction=False, 
#    verbose=True)

result_dict_kn = classify_and_regress(
    input_dict=embeddings_dict, 
    feature_keys=feature_keys, 
    regress_keys=regress_keys,
    classify_keys=classify_keys,
    regressor=KNeighborsRegressor(**kn_kargs),
    classifier=KNeighborsClassifier(**kn_kargs),
    add_prediction=False, 
    verbose=True)

In [None]:
from tissue_purifier.plot_utils.plot_misc import plot_bars

result_dict = result_dict_kn

keys_metric = set()
for kf, d_tmp in result_dict.items(): 
    for key in d_tmp.keys():
        if key.endswith("accuracy_test") or key.endswith("r2_test"):
            keys_metric.add(key)
            
print(keys_metric)

def compare_features_according_to_metric(dict_result, key_metric):
    x_labels = []
    metric_values = []
    for kf, kf_dict in dict_result.items():
        if key_metric in kf_dict.keys():
            x_labels.append(kf)
            metric_values.append(kf_dict[key_metric])
    return x_labels, metric_values

figs = []
for key_metric in keys_metric:
    x_labels, y_values = compare_features_according_to_metric(result_dict, key_metric)
    fig = plot_bars(y_values, x_labels=x_labels, title=key_metric)
    figs.append(fig)
    exp["feature_comparison"].log(neptune.types.File.as_image(fig))

In [None]:
result_dict_kn

In [None]:
figs[2]

In [None]:
exp.stop()