# Train a `bioLORD` model with `developing human immune across tissue` for `bioLORD` (B-cells)

The data was generated by Suo et al.[[1]](https://www.science.org/doi/full/10.1126/science.abo0510) and downloaded from [Lymphoid cells](https://cellgeni.cog.sanger.ac.uk/developmentcellatlas/fetal-immune/PAN.A01.v01.raw_count.20210429.LYMPHOID.embedding.h5ad). <br>
The complete dataset contains a cross-tissue single-cell atlas of developing human immune cells across prenatal hematopoietic, lymphoid, and nonlymphoid peripheral organs. This includes over 900,000 cells from which we identified over 100 cell states.

[[1] Suo, Chenqu, Emma Dann, Issac Goh, Laura Jardine, Vitalii Kleshchevnikov, Jong-Eun Park, Rachel A. Botting et al. "Mapping the developing human immune system across organs." Science (2022): eabo0510.](https://www.science.org/doi/full/10.1126/science.abo0510)


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append("/cs/usr/bar246802/bar246802/SandBox2023/biolord_immune_bcells/utils") # add utils
sys.path.append("/cs/usr/bar246802/bar246802/SandBox2023/biolord") # set path)

In [3]:
import biolord
import scanpy as sc
import anndata
import numpy as np
import pandas as pd
import torch
import umap.plot
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from cluster_analysis import *
from formatters import *

[rank: 0] Global seed set to 0


In [4]:
print(f"PyTorch version: {torch.__version__}")
# Set the device      
device = "gpu" if torch.backends.cuda.is_built() else "cpu"
print(f"Using device: {device}")

PyTorch version: 1.13.1+cu117
Using device: gpu


In [5]:
from tqdm import tqdm
tqdm(disable=True, total=0)  # initialise internal lock

<tqdm.std.tqdm at 0x7f6de0d417f0>

In [6]:
import mplscience
mplscience.set_style()

plt.rcParams['legend.scatterpoints'] = 1

## Set parameters

In [7]:
DATA_DIR = "../data/"
SAVE_DIR = "../output/"
FIG_DIR = "../figures/"

## Import processed data

In [8]:
adata = sc.read(DATA_DIR + "biolord_immune_bcells_bm.h5ad")

In [9]:
adata.obs["split"].value_counts()

train    57436
test      6382
ood        160
Name: split, dtype: int64

In [None]:
biolord.Biolord.setup_anndata(
    adata,
    categorical_attributes_keys=["celltype", "organ", "age"],
    retrieval_attribute_key="sex",
)

## Train model

In [None]:
# reconstruction_penalty
N_LATENT_ATTRIBUTE_CATEGORICAL = 4
module_params = {
    "autoencoder_width": 128,
    "autoencoder_depth": 2,
    "attribute_nn_width": 256,
    "attribute_nn_depth": 2,
    "n_latent_attribute_categorical": N_LATENT_ATTRIBUTE_CATEGORICAL,
    "loss_ae": "gauss",
    "loss_ordered_attribute": "gauss",
    "reconstruction_penalty": 1e2,
    "unknown_attribute_penalty": 1e1,
    "unknown_attribute_noise_param": 1e-1,
    "attribute_dropout_rate": 0.1,
    "use_batch_norm": False,
    "use_layer_norm": False,
    "seed": 42,
}


trainer_params = {
    "n_epochs_warmup": 0,
    "autoencoder_lr": 1e-4,
    "autoencoder_wd": 1e-4,
    "attribute_nn_lr": 1e-2,
    "attribute_nn_wd": 4e-8,
    "step_size_lr": 45,
    "cosine_scheduler": True,
    "scheduler_final_lr": 1e-5,
}

In [None]:
model = biolord.Biolord(
    adata=adata,
    n_latent=32,
    model_name="immune_bcells",
    module_params=module_params,
    train_classifiers=False,
    split_key="split",
)

In [None]:
model.train(max_epochs=1000,
            batch_size=512,
            plan_kwargs=trainer_params,
            early_stopping=True,
            early_stopping_patience=20,            
            check_val_every_n_epoch=10,
            num_workers=1)

## Evaluate the trained model

In [None]:
size = N_LATENT_ATTRIBUTE_CATEGORICAL
vals = [
    "generative_mean_accuracy", "generative_var_accuracy", "biolord_metric"
]
fig, axs = plt.subplots(nrows=1,
                        ncols=len(vals),
                        figsize=(size * len(vals), size))

model.epoch_history = pd.DataFrame().from_dict(
    model.training_plan.epoch_history)
for i, val in enumerate(vals):
    sns.lineplot(
        x="epoch",
        y=val,
        hue="mode",
        data=model.epoch_history[model.epoch_history["mode"] == "valid"],
        ax=axs[i],
    )

plt.tight_layout()
plt.show()

## Save the trained model

In [None]:
model.save(SAVE_DIR + "trained_model")

## Load saved model

In [16]:
model = biolord.Biolord.load(dir_path=SAVE_DIR + "trained_model", adata=adata, use_gpu=True)

[34mINFO    [0m File ..[35m/output/trained_model/[0m[95mmodel.pt[0m already downloaded                                                  


[rank: 0] Global seed set to 42


## Assess performance

In [None]:
idx_train = np.where(
    (adata.obs["split"] == "train") 
)[0]

adata_train= adata[idx_train].copy()


idx_test = np.where(
    (adata.obs["split"] == "test")
)[0]

adata_test = adata[idx_test].copy()

dataset_train = model.get_dataset(adata_train)
dataset_test = model.get_dataset(adata_test)

## Cluster embeddings

In [17]:
attributes_map = {
    "celltype": model.categorical_attributes_map["celltype"],
    "organ": model.categorical_attributes_map["organ"]
}

In [20]:
transf_embeddings_attributes = {
    attribute_:
    model.get_categorical_attribute_embeddings(attribute_key=attribute_)
    for attribute_ in model.categorical_attributes_map
}

keys = list(
    itertools.product(*[
        list(model.categorical_attributes_map[attribute_].keys())
        for attribute_ in model.categorical_attributes_map
    ]))

transf_embeddings_attributes_dict = {
    "_".join([str(k) for k in key_]): np.concatenate(([
        transf_embeddings_attributes[map_[0]][map_[1][key_[ci]], :]
        for ci, map_ in enumerate(model.categorical_attributes_map.items())
    ]), 0)
    for key_ in keys
}

transf_embeddings_attributes = [
    np.concatenate(([
        transf_embeddings_attributes[map_[0]][map_[1][key_[ci]], :]
        for ci, map_ in enumerate(model.categorical_attributes_map.items())
    ]), 0) for key_ in keys
]

In [21]:
transf_embeddings_attributes_ind = {
    attribute_:
    model.get_categorical_attribute_embeddings(attribute_key=attribute_)
    for attribute_ in attributes_map
}

keys = list(
    itertools.product(*[
        list(model.categorical_attributes_map[attribute_].keys())
        for attribute_ in attributes_map
    ]))

transf_embeddings_attributes_dict = {
    "_".join([str(k) for k in key_]): np.concatenate(([
        transf_embeddings_attributes_ind[map_[0]][map_[1][key_[ci]], :]
        for ci, map_ in enumerate(attributes_map.items())
    ]), 0)
    for key_ in keys
}

transf_embeddings_attributes = [
    np.concatenate(([
        transf_embeddings_attributes_ind[map_[0]][map_[1][key_[ci]], :]
        for ci, map_ in enumerate(attributes_map.items())
    ]), 0) for key_ in keys
]

In [22]:
transf_embeddings_attributes = np.asarray(transf_embeddings_attributes)

In [None]:
pca = sc.tl.pca(transf_embeddings_attributes)

In [23]:
mapper_latent = umap.UMAP().fit_transform(transf_embeddings_attributes)

In [24]:
cols = {
    attribute_ :  [key_[ci] for key_ in keys]
    for ci, attribute_ in enumerate(attributes_map)
}

In [None]:
adata.uns["organ_colors"] = [
    "#029e73", 
    "#949494",
    "#ece133",
    "#de8f05",
    "#ca9161",
    "#fbafe4",
    "#cc78bc",
    "#d55e00",
    "#0173b2",
]

In [25]:
df = pd.DataFrame(mapper_latent, columns=["umap1", "umap2"])
for i in range(pca.shape[1]):
    df[f"pc{i+1}"] = pca[:, i]
for col_, map_ in cols.items():
    df[col_] = map_
    

dfs = {}

for attribute_ in transf_embeddings_attributes_ind:
    dfs[attribute_] = pd.DataFrame(
        transf_embeddings_attributes_ind[attribute_],
        columns=["latent1", "latent2", "latent3", "latent4"])
    dfs[attribute_][attribute_] = list(attributes_map[attribute_].keys())
    dfs[attribute_][attribute_ + '_key'] = list(attributes_map[attribute_].values())
    df[attribute_ + "_key"] = df[attribute_].map(attributes_map[attribute_])

NameError: name 'pca' is not defined

In [31]:
attributes_map

{'celltype': {'B1': 0,
  'CYCLING_B': 1,
  'IMMATURE_B': 2,
  'LARGE_PRE_B': 3,
  'LATE_PRO_B': 4,
  'MATURE_B': 5,
  'PLASMA_B': 6,
  'PRE_PRO_B': 7,
  'PRO_B': 8,
  'SMALL_PRE_B': 9},
 'organ': {'BM': 0,
  'GU': 1,
  'KI': 2,
  'LI': 3,
  'MLN': 4,
  'SK': 5,
  'SP': 6,
  'TH': 7,
  'YS': 8}}

In [32]:
df = {}
cols = {
    attribute_: [key_[ci] for key_ in keys]
    for ci, attribute_ in enumerate(attributes_map)
}
for col_, map_ in cols.items():
    df[col_] = map_
df = pd.DataFrame(df)
for attribute_ in transf_embeddings_attributes_ind:
    print(df[attribute_])
    df[attribute_ + "_key"] = df[attribute_].map(attributes_map[attribute_])

0              B1
1              B1
2              B1
3              B1
4              B1
         ...     
85    SMALL_PRE_B
86    SMALL_PRE_B
87    SMALL_PRE_B
88    SMALL_PRE_B
89    SMALL_PRE_B
Name: celltype, Length: 90, dtype: object
0      BM
1      GU
2      KI
3      LI
4     MLN
     ... 
85    MLN
86     SK
87     SP
88     TH
89     YS
Name: organ, Length: 90, dtype: object


In [33]:
df

Unnamed: 0,celltype,organ,celltype_key,organ_key
0,B1,BM,0,0
1,B1,GU,0,1
2,B1,KI,0,2
3,B1,LI,0,3
4,B1,MLN,0,4
...,...,...,...,...
85,SMALL_PRE_B,MLN,9,4
86,SMALL_PRE_B,SK,9,5
87,SMALL_PRE_B,SP,9,6
88,SMALL_PRE_B,TH,9,7


In [None]:
df["celltype"] = df["celltype"].replace(
    {
        "B1": "B1", 
        "CYCLING_B": "cycling B", 
        "IMMATURE_B": "immature B", 
        "LARGE_PRE_B": "large pre B", 
        "LATE_PRO_B": "late pro B", 
        "MATURE_B": "mature B", 
        "PLASMA_B": "palsma B", 
        "PRE_PRO_B": "pre pro B", 
        "PRO_B": "pro B", 
        "SMALL_PRE_B": "small pre B", 
    }
)

In [None]:
df["organ"] = df["organ"].replace(
    {
        "BM": "Bone Marrow", 
        "GU": "Gut", 
        "KI": "Kidney", 
        "LI": "Liver", 
        "MLN": "Lymph Node", 
        "SK": "Skin", 
        "SP": "Spleen", 
        "TH": "Thymus", 
        "YS": "Yolk Sac",
    }
)

In [None]:
fig, axs = plt.subplots(1,1, figsize=(8,4))


sns.scatterplot(
    data=df, 
    x="umap1", 
    y="umap2", 
    hue="celltype",
    style="organ",
    ax=axs,  
    alpha=.8,
    s=60,
    palette="deep"
)



axs.set_title("cell type")
axs.set(xticklabels=[], yticklabels=[])
axs.set_xlabel("UMAP1")
axs.set_ylabel("UMAP2")
axs.grid(False)
axs.legend(loc="upper left", bbox_to_anchor=(1, 1), ncols=2)

plt.tight_layout()
plt.savefig(FIG_DIR + "cell_type.png", format="png", dpi=300)

plt.show()


In [None]:
SAVEFIG = False
fig, axs = plt.subplots(1,1, figsize=(8,4))

sns.scatterplot(
    data=df, 
    x="umap1", 
    y="umap2", 
    hue="organ",
    style="celltype",
    ax=axs,  
    alpha=.8,
    s=60,
    palette=[
    "#029e73", 
    "#949494",
    "#ece133",
    "#de8f05",
    "#ca9161",
    "#fbafe4",
    "#cc78bc",
    "#d55e00",
    "#0173b2",
]
)


axs.set_title("organ")
axs.set(xticklabels=[], yticklabels=[])
axs.set_xlabel("UMAP1")
axs.set_ylabel("UMAP2")
axs.grid(False)
axs.legend(loc="upper left", bbox_to_anchor=(1, 1), ncols=2)

plt.tight_layout()
plt.savefig(FIG_DIR + "organ.png", format="png", dpi=300)

plt.show()


In [None]:
attributes_map_rev = {}
for key in attributes_map:
    attributes_map_rev[key] = {v: k for k, v in attributes_map[key].items()}

In [None]:
attributes_map_rev

In [10]:
def cluster_evaluate_figures(attribute_):
    ground_truth_labels = np.array(df[attribute_ + '_key'])
    print("Number of samples:", ground_truth_labels.size)
    title = "Attribute: " + attribute_ 
    path = FIG_DIR + attribute_ + "_"
    scores = matrices_figures(transf_embeddings_attributes, ground_truth_labels, df,
                        attributes_map_rev, attribute_, title, path)

In [None]:
cluster_evaluate_figures("celltype")

In [None]:
cluster_evaluate_figures("organ")

In [11]:
def cluster_evaluate(model, attributes = ['celltype', 'organ']):
    transf_embeddings_attributes, df = get_transf_embeddings_attributes(model)
    all_scores = None
    for attribute in attributes:
        ground_truth_labels = np.array(df[attribute + '_key'])
        ground_truth_unique_labels = list(set(ground_truth_labels))
        print(f'For attribute {attribute} the # of unique true labels is: {len(ground_truth_unique_labels)}')
        scores = get_kmeans_score(transf_embeddings_attributes, ground_truth_labels)
        scores['attribute'] = attribute
        if all_scores is not None:
            all_scores = pd.concat([all_scores, scores], ignore_index=True)
        else:
            all_scores = scores
    cols = ['attribute', 'score_name', 'score', 'n_clusters']
    all_scores = all_scores[cols]
    print(all_scores)
    return all_scores

In [19]:
scores = cluster_evaluate(model)

{'celltype': ['B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'CYCLING_B', 'CYCLING_B', 'CYCLING_B', 'CYCLING_B', 'CYCLING_B', 'CYCLING_B', 'CYCLING_B', 'CYCLING_B', 'CYCLING_B', 'IMMATURE_B', 'IMMATURE_B', 'IMMATURE_B', 'IMMATURE_B', 'IMMATURE_B', 'IMMATURE_B', 'IMMATURE_B', 'IMMATURE_B', 'IMMATURE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LARGE_PRE_B', 'LATE_PRO_B', 'LATE_PRO_B', 'LATE_PRO_B', 'LATE_PRO_B', 'LATE_PRO_B', 'LATE_PRO_B', 'LATE_PRO_B', 'LATE_PRO_B', 'LATE_PRO_B', 'MATURE_B', 'MATURE_B', 'MATURE_B', 'MATURE_B', 'MATURE_B', 'MATURE_B', 'MATURE_B', 'MATURE_B', 'MATURE_B', 'PLASMA_B', 'PLASMA_B', 'PLASMA_B', 'PLASMA_B', 'PLASMA_B', 'PLASMA_B', 'PLASMA_B', 'PLASMA_B', 'PLASMA_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRE_PRO_B', 'PRO_B', 'PRO_B', 'PRO_B', 'PRO_B', 'PRO_B', 'PRO_B', 'PRO_B', 'PRO_B', 'PRO_B', 'SMALL_PRE_B',

AttributeError: 'list' object has no attribute 'map'

In [12]:
def train_model(module_params, trainer_params):   
    model = biolord.Biolord(
        adata=adata,
        n_latent=32,
        model_name="immune_bcells",
        module_params=module_params,
        train_classifiers=False,
        split_key="split",
    )
    
    model.train(max_epochs=1000,
            use_gpu=True,
            batch_size=512,
            plan_kwargs=trainer_params,
            early_stopping=True,
            early_stopping_patience=20,            
            check_val_every_n_epoch=10,
            num_workers=1)
    return model

In [13]:
def model_training_iterations():
    arr_n_latent_attribute_categorical = np.concatenate(
        (np.arange(3, 5), np.arange(5, 31, 5)))
    arr_reconstruction_penalty = [1e2, 1e3]
    arr_unknown_attribute_penalty = [1e-1, 1e1]
    arr_unknown_attribute_noise_param = [1e-1, 1e1]
    id_ = 1
    full_scores = None
    for n_latent_attribute_categorical, reconstruction_penalty, unknown_attribute_penalty, unknown_attribute_noise_param in itertools.product(
            arr_n_latent_attribute_categorical, arr_reconstruction_penalty,
            arr_unknown_attribute_penalty, arr_unknown_attribute_noise_param):
        print(f"loop index is {id_}")
        
        biolord.Biolord.setup_anndata(
            adata,
            categorical_attributes_keys=["celltype", "organ", "age"],
            retrieval_attribute_key="sex",
        )
        
        module_params = {
            "autoencoder_width": 128,
            "autoencoder_depth": 2,
            "attribute_nn_width": 256,
            "attribute_nn_depth": 2,
            "n_latent_attribute_categorical": n_latent_attribute_categorical,
            "loss_ae": "gauss",
            "loss_ordered_attribute": "gauss",
            "reconstruction_penalty": reconstruction_penalty,
            "unknown_attribute_penalty": unknown_attribute_penalty,
            "unknown_attribute_noise_param": unknown_attribute_noise_param,
            "attribute_dropout_rate": 0.1,
            "use_batch_norm": False,
            "use_layer_norm": False,
            "seed": 42,
        }

        trainer_params = {
            "n_epochs_warmup": 0,
            "autoencoder_lr": 1e-4,
            "autoencoder_wd": 1e-4,
            "attribute_nn_lr": 1e-2,
            "attribute_nn_wd": 4e-8,
            "step_size_lr": 45,
            "cosine_scheduler": True,
            "scheduler_final_lr": 1e-5,
        }
        model = train_model(module_params, trainer_params)
        scores = cluster_evaluate(model)
        scores[
            'n_latent_attribute_categorical'] = n_latent_attribute_categorical
        scores['reconstruction_penalty'] = reconstruction_penalty
        scores['unknown_attribute_penalty'] = unknown_attribute_penalty
        scores['unknown_attribute_noise_param'] = unknown_attribute_noise_param
        if full_scores is not None:
            full_scores = pd.concat([full_scores, scores], ignore_index=True)
        else:
            full_scores = scores


#         model.save(SAVE_DIR + "trained_model_" + str(id_), overwrite=True)

        id_ += 1

        full_scores.to_csv(SAVE_DIR + "trained_models_scores.csv")

In [15]:
model_training_iterations()

[rank: 0] Global seed set to 42


loop index is 1


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 104/1000:  10%|█         | 104/1000 [06:58<1:00:05,  4.02s/it, v_num=1, val_generative_mean_accuracy=0.45, val_generative_var_accuracy=0.239, val_biolord_metric=0.345, val_reconstruction_loss=265, val_unknown_attribute_penalty_loss=3.88, generative_mean_accuracy=0, generative_var_accuracy=0, biolord_metric=0, reconstruction_loss=31.2, unknown_attribute_penalty_loss=4.91]
Monitored metric val_biolord_metric did not improve in the last 20 records. Best score: 0.346. Signaling Trainer to stop.


AttributeError: 'list' object has no attribute 'map'