# Contrastive Learning

### Add the src folder to the path

In [1]:
import sys
import os

# Locate the parent of the current working directory and put its "src"
# sub-folder at the front of the import search path.
root_path = os.path.split(os.getcwd())[0]
src_path = os.path.join(root_path, "src")
sys.path[:0] = [src_path]

In [2]:
%load_ext autoreload
%autoreload 2

In [5]:
%matplotlib inline
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import pytorch_lightning as pl
import tissue_purifier as tp
from tissue_purifier.data_utils.helpers import define_train_loader, define_test_loader
from tissue_purifier.model_utils.simclr import SimCLR
from pytorch_lightning.loggers import NeptuneLogger
from tissue_purifier.data_utils import SpatialAutocorrelation
from tissue_purifier.plot_utils import plot_all_maps

In [6]:
from tissue_purifier.model_utils.helpers import define_simclr


### Read in the config file

In [7]:
# The Neptune API token is a credential and must not be stored in the notebook.
# It is read from the NEPTUNE_API_TOKEN environment variable instead; the value
# is None when that variable is not set.
# NOTE(review): the token that was previously committed on this line should be
# treated as leaked and revoked/rotated.
NEPTUNE_API_TOKEN = os.environ.get("NEPTUNE_API_TOKEN")

In [33]:
def flatten_dict(dd, separator='_', prefix=''):
    """Flatten a nested dictionary into a single-level dictionary.

    Keys along each path are joined with ``separator``. When ``prefix`` is
    truthy it is prepended (followed by ``separator``) to every resulting key.

    Args:
        dd: The object to flatten. If it is not a ``dict`` it is treated as a
            leaf and returned as ``{prefix: dd}``.
        separator: String placed between the joined key components.
        prefix: Key component prepended to every flattened key.

    Returns:
        A new flat ``dict``. Nested dictionaries that are empty contribute
        no entries.
    """
    # Leaf value: nothing to descend into.
    if not isinstance(dd, dict):
        return {prefix: dd}

    flat = {}
    for child_key, child_value in dd.items():
        # The recursive call already prefixes its keys with child_key.
        flattened_child = flatten_dict(child_value, separator, child_key)
        for sub_key, leaf in flattened_child.items():
            if prefix:
                flat[prefix + separator + sub_key] = leaf
            else:
                flat[sub_key] = leaf
    return flat

# Parse the YAML configuration found in the current working directory.
with open("./config.yaml", mode='r') as config_file:
    config = yaml.safe_load(config_file)

In [34]:
# Build the object returned by the project helper define_simclr from the
# loaded configuration (presumably a SimCLR model — the helper is not shown here).
simclr = define_simclr(config)

In [44]:
# Pull the "sampler" and "reference" entries out of predict_transform so each
# can be inspected on its own in the following cells.
transf1 = simclr.predict_transform["sampler"]
transf2 = simclr.predict_transform["reference"]

In [45]:
transf1

Compose(
    DropoutSparseTensor(dropout=(0.0,0.4), same_dropout_rate_for_all_elements=False)
    SparseToDense
    RandomGaussianBlur(sigma=(1.0,4.0))
    RandomAffine(degrees=[-180.0, 180.0], translate=(0.05, 0.05), scale=(0.75, 1.25), shear=[-0.0, 0.0])
    CenterCrop(size=(224, 224))
    RandomStraightCut(p=0.5, occlusion_fraction=0.05)
    ScaleIntensity(in_range=image, out_range=(0.0, 1.0))
    RandomIntensity(factor=(0.7,1.3)
    RandomVerticalFlip(p=0.5)
    RandomHorizontalFlip(p=0.5)
    Resize(size=224, interpolation=bilinear)
    StackTensor(dim=-4)
)

In [46]:
transf2

Compose(
    SparseToDense
    StackTensor(dim=-4)
    RandomGaussianBlur(sigma=(1.0,4.0))
    CenterCrop(size=(224, 224))
    ScaleIntensity(in_range=image, out_range=(0.0, 1.0))
    Resize(size=224, interpolation=bilinear)
)

In [None]:
vars(simclr)

In [None]:
# Flatten the nested configuration into single-level keys that all start with
# "config" so every value can be logged as its own text entry.
flatten_config = flatten_dict(config, separator="_", prefix='config')

backbone_tag = config["model_settings"]["BACKBONE_TYPE"]
source_files_to_upload = [
    "config.yaml",
    "main*.py",
    "tissue_purifier/data_utils/helpers.py",
]
neptune_logger = NeptuneLogger(
    offline_mode=True,
    api_key=NEPTUNE_API_TOKEN,
    project_name='cellarium/tissue-purifier',  # change this to your project
    experiment_name="Contrastive Learning",
    tags=[backbone_tag],
    upload_source_files=source_files_to_upload,
)

# Record each flattened configuration entry on the experiment as text.
for entry_name, entry_value in flatten_config.items():
    neptune_logger.experiment.log_text(entry_name, str(entry_value))

# Echo the full configuration in the cell output.
print(yaml.dump(config))

## Manual seeding

In [None]:
# Seed torch, numpy and pytorch-lightning from the same configured value.
random_seed = config['simulation']['RANDOM_SEED']
torch.manual_seed(random_seed)
np.random.seed(random_seed)
pl.seed_everything(random_seed)

## Load the data

In [None]:
# Folder the slide CSV files are read from (relative to the working directory).
data_folder = "../slide-seq-data"
# Folder the trained model weights are written to after training.
model_folder = "../trained_model"

In [None]:
# Read the six slide CSVs from data_folder in a fixed order:
# three "wt" files followed by three "sick" files.
csv_file_names = [
    "wt_1.csv",
    "wt_2.csv",
    "wt_3.csv",
    "sick_1.csv",
    "sick_2.csv",
    "sick_3.csv",
]
df_wt1, df_wt2, df_wt3, df_dis1, df_dis2, df_dis3 = [
    pd.read_csv(os.path.join(data_folder, file_name))
    for file_name in csv_file_names
]

# Parallel lists: dataframe, integer label (0 for wt, 1 for dis) and short name.
all_df = [df_wt1, df_wt2, df_wt3, df_dis1, df_dis2, df_dis3]
all_labels_sparse_images = [0, 0, 0, 1, 1, 1]
all_names_sparse_images = ["wt1", "wt2", "wt3", "dis1", "dis2", "dis3"]

# Build one SparseImage per dataframe from its "x"/"y" columns, using
# "cell_type" as the category and the configured pixel size.
pixel_size = config["simulation"]["PIXEL_SIZE"]
all_sparse_images = [
    tp.data_utils.SparseImage.from_panda(
        frame, "x", "y", category="cell_type", pixel_size=pixel_size, padding=10
    )
    for frame in all_df
]


## Initialization and training

In [None]:
# Build the model from the "model_settings" section of the configuration.
model_settings = config["model_settings"]
model = tp.model_utils.helpers.define_model(
    backbone_type=model_settings["BACKBONE_TYPE"],
    number_of_channels=model_settings["INPUT_CHANNELS"],
    num_of_filters=model_settings["BACKBONE_NUM_FTRS"],
    projection_out_dim=model_settings["PROJECTION_OUT_DIM"],
)

In [None]:
# Read the run mode from the configuration and show it in the cell output.
simulation_type = config["simulation"]["TYPE"]
print(f"simulation_type -> {simulation_type}")

In [None]:
# Train, resume, or only load the encoder depending on simulation_type:
#   'scratch'       -> train `model` and save its weights
#   'resume'        -> load "ckpt.pt" into the model, then train and save
#   'fully_trained' -> load "ckpt.pt" into the model and skip training
# Any other value raises ValueError.
#
# NOTE(review): define_trainloaders, lightly, TrainableEncoder and
# TrainedEncoder are not imported in any cell visible in this notebook (only
# define_train_loader / define_test_loader are). They must be brought into
# scope before this cell can run — confirm the intended names and import paths.
# NOTE(review): training writes <model_folder>/simclr_model.pt while loading
# reads ./ckpt.pt — confirm these are meant to be different files.
if simulation_type in ('scratch', 'resume'):
    optimizer, scheduler = tp.model_utils.helpers.define_optimizer_and_scheduler(
        model=model,
        num_epochs=config["simulation"]["MAX_EPOCHS"],
        optimizer_type=config["optimizer"]["OPTIMIZER_TYPE"],
        learning_rate=config["optimizer"]["LEARNING_RATE"],
        is_scheduled=config["optimizer"]["IS_SCHEDULED"],
        scheduler_step_size=config["optimizer"]["SCHEDULER_STEP_SIZE"],
        scheduler_gamma=config["optimizer"]["SCHEDULER_GAMMA"]
    )

    print("Define the trainloader")
    trainloader_train, _ = define_trainloaders(
        all_sparse_images,
        all_labels_sparse_images,
        all_names_sparse_images,
        config,
        use_train_test_transform=(True, False),
        paired_output=True
    )

    criterion = lightly.loss.NTXentLoss()
    encoder = TrainableEncoder(model, criterion, optimizer, trainloader_train, scheduler)

    if simulation_type == 'resume':
        # map_location keeps the load working on CPU-only machines even when
        # the checkpoint was written from a GPU; load_state_dict then copies
        # the values into the model's existing parameters.
        ckpt = torch.load("ckpt.pt", map_location=torch.device('cpu'))
        encoder.model.load_state_dict(ckpt)

    print("start training")
    encoder.train_embedding(
        gpus=1 if torch.cuda.is_available() else 0,
        progress_bar_refresh_rate=0,
        max_epochs=config["simulation"]["MAX_EPOCHS"],
        log_every_n_steps=10,
        logger=neptune_logger
    )
    print("end training")

    # Persist the trained weights locally and attach the file to the Neptune run.
    os.makedirs(model_folder, exist_ok=True)
    saved_model_path = os.path.join(model_folder, "simclr_model.pt")
    torch.save(model.state_dict(), saved_model_path)
    neptune_logger.log_artifact(saved_model_path)
    print("model written to file")

elif simulation_type == 'fully_trained':
    encoder = TrainedEncoder(model)
    # See the note on map_location in the 'resume' branch above.
    ckpt = torch.load("ckpt.pt", map_location=torch.device('cpu'))
    encoder.model.load_state_dict(ckpt)
    print("loaded model from file")
else:
    # ValueError is a subclass of Exception, so existing handlers still catch it.
    raise ValueError(f"simulation_type not recognized. Received {simulation_type}")

# Embeddings

In [None]:
print("Evaluation begings!")
print("Define the loaders!")
# NOTE(review): define_testloaders is not imported in any visible cell (the
# import cell brings in define_test_loader) — confirm the intended helper.
loader_train, loader_test = define_testloaders(
    all_sparse_images,
    all_labels_sparse_images,
    all_names_sparse_images,
    config,
    use_train_test_transform=(True, True),
)

# Spatial-autocorrelation callback configured for the 'moran' modality with
# 6 neighbours, no radius and no neighbour correction.
autocorrelation_settings = dict(
    modality='moran',
    n_neighbours=6,
    radius=None,
    neigh_correct=False,
)
sparse_tensor_callback = SpatialAutocorrelation(**autocorrelation_settings)

#sparse_tensor_callback = None

In [None]:
print("computing the embeddings")
# Both results are expected to be EmbedOutput objects. The type is recorded in
# this comment rather than as a variable annotation: module-level annotations
# are evaluated at run time, and EmbedOutput is not imported in any visible
# cell, so the annotated form raises NameError right after the first (costly)
# embed call has finished.
output_embed_test = encoder.embed(loader_test, sparse_tensor_callback)
output_embed_train = encoder.embed(loader_train, sparse_tensor_callback)
# NOTE(review): later cells read output_head_test / output_head_train, which no
# visible cell defines — confirm whether they should use these variables.

# Persist both results locally, then attach the files to the Neptune run.
torch.save(output_embed_test, "./output_embed_test.pt")
torch.save(output_embed_train, "./output_embed_train.pt")

neptune_logger.log_artifact("./output_embed_test.pt")
neptune_logger.log_artifact("./output_embed_train.pt")
print("embeddings logged in neptune")

## Load the embeddings if necessary

In [None]:
#embedding_folder = "./embeddings"
#
#output_bbone_test = torch.load(os.path.join(embedding_folder, "output_bbone_test.pt"), map_location=torch.device('cpu'))
#output_bbone_train = torch.load(os.path.join(embedding_folder, "output_bbone_train.pt"), map_location=torch.device('cpu'))
#output_head_test = torch.load(os.path.join(embedding_folder, "output_head_test.pt"), map_location=torch.device('cpu'))
#output_head_train = torch.load(os.path.join(embedding_folder, "output_head_train.pt"), map_location=torch.device('cpu'))

# knn graph

In [None]:
print("compute the knn only for head with cosine")
# random_samples = torch.randperm(len(loader_test.dataset.x))[:10]
random_samples = [230, 217, 256, 196, 192, 272, 240, 109, 161, 265]
print("random_samples", random_samples)

# Both splits are plotted with identical settings; only the embeddings and the
# loader differ between the two calls.
knn_plot_settings = dict(
    n_neighbors=5,
    examples=random_samples,
    cmap=plt.cm.viridis,
    intensity_scale_factor=2.5,
    metric="cosine",
    plot_histogram=True,
    bins=50,
)

knn_test = tp.plot_utils.plot_knn_examples(
    output_head_test.embeddings, loader_test, **knn_plot_settings
)

knn_train = tp.plot_utils.plot_knn_examples(
    output_head_train.embeddings, loader_train, **knn_plot_settings
)

In [None]:
knn_test

In [None]:
knn_train

In [None]:
#display(maps_bbone_train_v2[1])

In [None]:
#display(maps_bbone_test_v1[1])

In [None]:
#display(maps_bbone_test_v2[2])

## Maps of the projection head

In [None]:
print("computing the maps")

# Colour map and figure size are the same for all four sets of maps.
shared_map_settings = dict(cmap=plt.cm.inferno, figsize=(5, 5))

maps_head_test_intra = plot_all_maps(
    embed_output=output_head_test,
    remove_intra_tissue_connectivity=False,
    title_suffix="test intra",
    **shared_map_settings,
)

maps_head_train_intra = plot_all_maps(
    embed_output=output_head_train,
    remove_intra_tissue_connectivity=False,
    title_suffix="train intra",
    **shared_map_settings,
)

maps_head_test_NO_intra = plot_all_maps(
    embed_output=output_head_test,
    remove_intra_tissue_connectivity=True,
    title_suffix="test NO intra",
    **shared_map_settings,
)

maps_head_train_NO_intra = plot_all_maps(
    embed_output=output_head_train,
    remove_intra_tissue_connectivity=True,
    title_suffix="train NO intra",
    **shared_map_settings,
)

# Log every item of each result under the name of the variable holding it,
# in the same order as the results were computed.
maps_to_log = (
    ("maps_head_test_intra", maps_head_test_intra),
    ("maps_head_train_intra", maps_head_train_intra),
    ("maps_head_test_NO_intra", maps_head_test_NO_intra),
    ("maps_head_train_NO_intra", maps_head_train_NO_intra),
)
for log_name, maps in maps_to_log:
    for single_map in maps:
        neptune_logger.log_image(log_name, single_map)

print("maps logged into neptune")

# Train a classifier to find the tissue labels

In [None]:
output_head_test.embeddings.shape

In [None]:
import numpy
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split


# Features are the test-split embeddings; targets are the matching fnames.
X = output_head_test.embeddings.cpu().numpy()
y = numpy.array(output_head_test.fnames)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

# Fit a small MLP on the training part and score it on the held-out part.
classifier = MLPClassifier(
    random_state=1,
    hidden_layer_sizes=2,
    max_iter=30000,
    tol=1E-8,
    n_iter_no_change=100,
)
classifier.fit(X_train, y_train)
y_predicted = classifier.predict(X_test)
is_correct = (y_test == y_predicted).astype(float)
accuracy = numpy.mean(is_correct)
print("accuracy ->", accuracy)
plt.plot(classifier.loss_curve_)

In [None]:
# Fit a small MLP that predicts fnames from the train-split embeddings.
X = output_head_train.embeddings.cpu().numpy()
# The labels must come from the same split as the embeddings; reading them from
# output_head_test pairs features with labels of a different set of samples.
y = numpy.array(output_head_train.fnames)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

classifier = MLPClassifier(random_state=1, hidden_layer_sizes=2, max_iter=30000, tol=1E-6, n_iter_no_change=100)
classifier.fit(X_train, y_train)
y_predicted = classifier.predict(X_test)
# Fraction of held-out samples whose predicted fname matches the true one.
accuracy = numpy.mean((y_test == y_predicted).astype(float))
print("accuracy ->", accuracy)
plt.plot(classifier.loss_curve_)

# STOP HERE

### Save the trained model locally and in Neptune

In [None]:
#os.makedirs(model_folder, exist_ok=True)
#torch.save(model.state_dict(), os.path.join(model_folder, "simclr_model.pt"))
#neptune_logger.log_artifact(os.path.join(model_folder, "simclr_model.pt"))

## TensorFlow Projector

In [None]:
#embeddings=embeddings_bbone_test
#tissue_list = tissue_list_test
#condition_list=condition_list_test
#morant_value=morant_value_test

In [None]:
#import numpy
#
#tp.evaluation_utils.create_projector(
#    testloader, embeddings, {"tissue": tissue_list,
#                             "condition": condition_list, 
#                             "moran": morant_value.tolist()}, apply_compress=False
#)

In [None]:
#%load_ext tensorboard

In [None]:
#!tensorboard --logdir ./projector --bind_all

## Create a dataset to train a Linear Classifier by subsampling and encoding

In [None]:
#all_df = [df_wt1, df_wt2, df_wt3, df_dis1, df_dis2, df_dis3]
#all_labels_sparse_images = [0, 0, 0, 1, 1, 1]
#all_names_sparse_images = ["wt1", "wt2", "wt3", "dis1", "dis2", "dis3"]
#all_sparse_images = [tp.data_utils.SparseImage.from_panda(
#      df, "x", "y", category="cell_type", pixel_size=config["simulation"]["PIXEL_SIZE"], padding=10) 
#                   for df in all_df]

#config["simulation"]["N_CROPS_TEST_FOR_TISSUE"] = 24
#
#index = [0,1,3,4]
#suffix = "_".join([str(tmp) for tmp in index])
#name = "classifier_data_"+suffix+".pt"
#saved_file = os.path.join(model_folder, name)
#print("working on ->", saved_file)
#
#dataloader = define_testloader([all_sparse_images[i] for i in index],
#                               [all_labels_sparse_images[i] for i in index],
#                               [all_names_sparse_images[i] for i in index],
#                               config,
#                               use_test_transform=True)
#
#output  = encoder.embed_by_backbone(dataloader, 
#                                    sparse_tensor_callback = SpatialAutocorrelation(modality='moran',
#                                                                                    n_neighbours=6,
#                                                                                    radius=None,
#                                                                                    neigh_correct=False))
#
#torch.save(output, saved_file)
#print("saved the file ->", saved_file)

In [None]:
#torch.load("../trained_model/classifier_data_0_1_3_4.pt")