# Train a `bioLORD` model on the `developing human immune across tissue` dataset (B-cells)

The data was generated by Suo et al.[[1]](https://www.science.org/doi/full/10.1126/science.abo0510) and downloaded from [Lymphoid cells](https://cellgeni.cog.sanger.ac.uk/developmentcellatlas/fetal-immune/PAN.A01.v01.raw_count.20210429.LYMPHOID.embedding.h5ad). <br>
The complete dataset contains a cross-tissue single-cell atlas of developing human immune cells across prenatal hematopoietic, lymphoid, and nonlymphoid peripheral organs. It includes over 900,000 cells, from which the authors identified over 100 cell states.

[[1] Suo, Chenqu, Emma Dann, Issac Goh, Laura Jardine, Vitalii Kleshchevnikov, Jong-Eun Park, Rachel A. Botting et al. "Mapping the developing human immune system across organs." Science (2022): eabo0510.](https://www.science.org/doi/full/10.1126/science.abo0510)


In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import os
import sys
# Extend the import search path with two project directories.
# NOTE(review): both entries are absolute paths on one user's machine, so the
# imports that rely on them will fail elsewhere unless the paths are adjusted.
# Presumably `cluster_analysis` and `formatters` live in the utils directory — confirm.
sys.path.append("/cs/usr/bar246802/bar246802/SandBox2023/biolord_immune_bcells/utils") # add utils
sys.path.append("/cs/usr/bar246802/bar246802/SandBox2023/biolord") # set path

In [4]:
import biolord
import scanpy as sc
import anndata
import numpy as np
import pandas as pd
from os.path import exists
import torch
import umap.plot
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from cluster_analysis import *
from formatters import *

[rank: 0] Global seed set to 0


In [5]:
print(f"PyTorch version: {torch.__version__}")
# Pick the accelerator at runtime.
# `torch.backends.cuda.is_built()` only reports whether this PyTorch binary was
# compiled with CUDA support, so it stays True on machines without a usable GPU.
# `torch.cuda.is_available()` checks that a CUDA device can actually be used.
device = "gpu" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

PyTorch version: 1.11.0
Using device: gpu


In [6]:
from tqdm import tqdm
# Instantiate a disabled, zero-length progress bar purely for its side effect of
# setting up tqdm's internal lock up front.
# NOTE(review): presumably a workaround for lock creation once worker processes
# are started during training — confirm.
tqdm(disable=True, total=0)  # initialise internal lock

<tqdm.std.tqdm at 0x7f10fd519790>

In [7]:
import mplscience
# Apply the mplscience plotting style.
mplscience.set_style()

# Show a single marker per entry in scatter-plot legends.
plt.rcParams['legend.scatterpoints'] = 1

## Set parameters

In [8]:
# Directory layout, relative to the notebook. Each directory string keeps its
# trailing slash because later cells build file paths by plain concatenation.
DATA_DIR, SAVE_DIR, FIG_DIR = "../data/", "../output/", "../figures/"
# CSV file that accumulates the scores of every trained model.
LOGS_CSV = f"{SAVE_DIR}trained_models_scores.csv"

## Import processed data

In [None]:
# Load the processed AnnData object used for training and for the gene export below.
adata = sc.read(f"{DATA_DIR}2_biolord_immune_bcells_bm.h5ad")

In [None]:
# Show how many cells the loaded data assigns to each value of the "split" column.
adata.obs["split"].value_counts()

In [None]:
def cluster_evaluate_figures(attribute_):
    """Produce the clustering-evaluation figures for one attribute.

    Relies on notebook-level globals that must exist when this is called:
    `df`, `transf_embeddings_attributes`, `attributes_map_rev` and `FIG_DIR`.
    NOTE(review): `df` and `transf_embeddings_attributes` are not assigned at
    notebook level in the visible cells (they are only locals of
    `cluster_evaluate`) — confirm where they are expected to come from.

    Parameters
    ----------
    attribute_ : str
        Attribute name. Ground-truth labels are read from the `df` column
        named `<attribute_>_key`, and figure paths are prefixed with
        `FIG_DIR + attribute_ + "_"`.

    Returns
    -------
    The value returned by `matrices_figures` (previously computed and discarded).
    """
    ground_truth_labels = np.array(df[attribute_ + '_key'])
    print("Number of samples:", ground_truth_labels.size)
    title = "Attribute: " + attribute_
    path = FIG_DIR + attribute_ + "_"
    # Hand the result back to the caller instead of dropping it in an unused local.
    return matrices_figures(transf_embeddings_attributes, ground_truth_labels, df,
                            attributes_map_rev, attribute_, title, path)

In [None]:
def cluster_evaluate(model, id_, attributes = ['celltype', 'organ']):
    """Score k-means clusterings of a trained model's attribute embeddings.

    Parameters
    ----------
    model
        Trained model passed to `get_transf_embeddings_attributes`.
    id_
        Identifier of the model, forwarded to `get_kmeans_score`.
    attributes : list of str
        Attribute names; for each, ground-truth labels are taken from the
        `<attribute>_key` column of the frame returned alongside the embeddings.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the `score_name`, `score` and `n_clusters`
        columns of the scores table, safe for the caller to add columns to.
    """
    transf_embeddings_attributes, df = get_transf_embeddings_attributes(model)
    attributes_ground_truth_labels = {'attributes': [], 'true_labels': []}
    for attribute in attributes:
        ground_truth_labels = np.array(df[attribute + '_key'])
        attributes_ground_truth_labels['attributes'].append(attribute)
        attributes_ground_truth_labels['true_labels'].append(ground_truth_labels)
        n_unique_labels = len(set(ground_truth_labels))
        print(f'For attribute {attribute} the # of unique true labels is: {n_unique_labels}')

    path = SAVE_DIR + "kmeans_models_scores.csv"
    # Cluster counts 2..15 are evaluated.
    n_clusters_range = np.arange(2, 16).astype(int)
    scores = get_kmeans_score(transf_embeddings_attributes, attributes_ground_truth_labels, n_clusters_range=n_clusters_range, id_=id_, save_path=path)
    # Explicit copy: the caller assigns new columns to the result, which on a
    # column slice of `scores` triggers pandas' SettingWithCopyWarning.
    all_scores = scores[['score_name', 'score', 'n_clusters']].copy()
    print(all_scores)
    return all_scores

In [None]:
def split_adata_into_train_test(ood_frac=0.0025, test_size=0.1, random_state=42):
    """Assign every cell of the global `adata` to 'train', 'test' or 'ood'.

    Overwrites `adata.obs['split']` in place and prints the resulting counts
    and percentages. The defaults reproduce the previously hard-coded values.

    Parameters
    ----------
    ood_frac : float
        Fraction of all cells sampled into the 'ood' group.
    test_size : float
        Share of the remaining (non-OOD) cells placed in 'test'; forwarded to
        `train_test_split`.
    random_state : int
        Seed used both for the OOD sampling and for the train/test split. With
        a fixed seed the same assignment is produced on every call; pass a
        different value to obtain a different split.
    """
    from sklearn.model_selection import train_test_split
    # Reset the column to a plain placeholder before assigning the three groups.
    adata.obs['split'] = 'nan'
    ood_samples = adata.obs.sample(frac=ood_frac, random_state=random_state).index
    adata.obs.loc[ood_samples, "split"] = 'ood'

    # Only the cells not held out as OOD are divided into train and test.
    adata_idx = adata.obs_names[adata.obs["split"] != 'ood']
    adata_idx_train, adata_idx_test = train_test_split(
        adata_idx, test_size=test_size, random_state=random_state)
    adata.obs.loc[adata_idx_train, "split"] = 'train'
    adata.obs.loc[adata_idx_test, "split"] = 'test'

    split_counts = adata.obs['split'].value_counts()
    print("Simaple value count of train, test, OOD:")
    print(split_counts)
    print("\n")
    print("Train, test, OOD by percentage:")
    split_percentages = adata.obs['split'].value_counts(normalize=True) * 100
    print(split_percentages)

In [None]:
def train_model(module_params, trainer_params):
    """Build and train a Biolord model on the global `adata`.

    Parameters
    ----------
    module_params : dict
        Passed to `biolord.Biolord` as `module_params`.
    trainer_params : dict
        Passed to `model.train` as `plan_kwargs`.

    Returns
    -------
    The trained `biolord.Biolord` model.
    """
    # The split column is re-assigned before every training run. The split is
    # seeded (random_state=42), so a plain call yields the same assignment
    # every time rather than a fresh random split.
    split_adata_into_train_test()
    model = biolord.Biolord(
        adata=adata,
        n_latent=32,
        model_name="immune_bcells",
        module_params=module_params,
        train_classifiers=False,
        split_key="split",
    )

    # Follow the notebook-level `device` setting instead of always requesting a
    # GPU, so the notebook also runs on machines where `device` is "cpu".
    model.train(max_epochs=1000,
            use_gpu=(device == "gpu"),
            batch_size=512,
            plan_kwargs=trainer_params,
            early_stopping=True,
            early_stopping_patience=20,
            check_val_every_n_epoch=10,
            enable_checkpointing=False,
            num_workers=1)
    return model

In [None]:
def get_model_id():
    """Return the id to assign to the next trained model.

    Reads the `id_` column of the scores log at `LOGS_CSV` and returns one more
    than the largest id recorded there.

    Returns
    -------
    int
        1 when the log file does not exist or holds no usable id, otherwise
        the largest logged id plus one.
    """
    if not exists(LOGS_CSV):
        return 1
    # Parse ids numerically. The previous `str(id_).isnumeric()` check rejected
    # float maxima such as "2.0" (the dtype pandas picks when any id is missing),
    # which silently restarted numbering at 1.
    logged_ids = pd.to_numeric(pd.read_csv(LOGS_CSV)['id_'], errors='coerce')
    last_id = logged_ids.max()
    # `max()` is NaN for an empty column or one without any numeric entry.
    if pd.isna(last_id):
        return 1
    return max(int(last_id) + 1, 1)

In [None]:
# Preview of the hyper-parameter grid: every combination of the four value
# lists below, printed together with its position in the enumeration
# (the first ten combinations are not shown).
arr_n_latent_attribute_categorical = 2 ** np.arange(4, 8)
arr_reconstruction_penalty = [1e-1, 1e1, 1e2, 1e3]
arr_unknown_attribute_penalty = [1e-1, 1e1, 1e2, 1e3]
arr_unknown_attribute_noise_param = [1e-1, 1e1, 1e2, 1e3]

parms_combos = itertools.product(
    arr_n_latent_attribute_categorical,
    arr_reconstruction_penalty,
    arr_unknown_attribute_penalty,
    arr_unknown_attribute_noise_param,
)
for i, n in enumerate(parms_combos):
    if i >= 10:
        print(n, 'i=', i)
In [None]:
def model_training_iterations():
    """Train, score and save one model per selected hyper-parameter combination.

    Enumerates the Cartesian product of four hyper-parameter value lists and,
    for each combination that passes the filter below:

    1. registers `adata` with `biolord.Biolord.setup_anndata`,
    2. trains a model via `train_model`,
    3. scores it via `cluster_evaluate`,
    4. tags the scores with the hyper-parameters and the model id,
    5. saves the model under `SAVE_DIR + "trained_model_<id_>"`,
    6. writes the scores to `LOGS_CSV` (fresh file with header when `id_ == 1`
       or the file does not exist, otherwise appended without header).

    A combination at enumeration index `i` is skipped when `i + 1 < id_` or
    when `i` is not listed in `models_of_interests`.

    NOTE(review): `id_` starts from `get_model_id()` and grows by one per
    trained model, whereas `i` is the position in the full grid. With the
    `models_of_interests` filter active these two counters diverge, so the
    `i + 1 < id_` resume check may not skip already-trained combinations —
    confirm the intended resume behavior.
    """
    # Earlier grid, kept for reference:
    # arr_n_latent_attribute_categorical = np.concatenate(
    #     (np.arange(3, 5, 1), np.arange(5, 31, 5)))
    # arr_reconstruction_penalty = [1e1, 1e2, 1e3]
    # arr_unknown_attribute_penalty = [1e-2, 1e-1, 1e1]
    # arr_unknown_attribute_noise_param = [1e-2, 1e-1, 1e1]

    arr_n_latent_attribute_categorical = 2 ** np.arange(4, 8)
    arr_reconstruction_penalty = [1e-1, 1e1, 1e2, 1e3]
    arr_unknown_attribute_penalty = [1e-1, 1e1, 1e2, 1e3]
    arr_unknown_attribute_noise_param = [1e-1, 1e1, 1e2, 1e3]
    # Enumeration indices (0-based) of the grid combinations to train.
    models_of_interests = [51, 199, 254]
    
    parms_combos = itertools.product(arr_n_latent_attribute_categorical,
                                     arr_reconstruction_penalty,
                                     arr_unknown_attribute_penalty,
                                     arr_unknown_attribute_noise_param)
    # First id to assign; continues after the largest id found in the scores log.
    id_ = get_model_id()
    for i, (n_latent_attribute_categorical, reconstruction_penalty,
            unknown_attribute_penalty, unknown_attribute_noise_param
             ) in enumerate(parms_combos):
        if (i + 1 < id_) or i not in models_of_interests:
            continue
        print(
            f'n_latent_attribute_categorical = {n_latent_attribute_categorical}, reconstruction_penalty = {reconstruction_penalty},unknown_attribute_penalty = {unknown_attribute_penalty}, unknown_attribute_noise_param = {unknown_attribute_noise_param}, i={i+1}'
        )

        # Register `adata` with biolord before the model is built in `train_model`.
        biolord.Biolord.setup_anndata(
            adata,
            categorical_attributes_keys=["celltype", "organ", "age"],
            retrieval_attribute_key="sex",
        )

        # Only the four swept values vary between iterations; the rest is fixed.
        module_params = {
            "autoencoder_width": 128,
            "autoencoder_depth": 2,
            "attribute_nn_width": 256,
            "attribute_nn_depth": 2,
            "n_latent_attribute_categorical": n_latent_attribute_categorical,
            "loss_ae": "gauss",
            "loss_ordered_attribute": "gauss",
            "reconstruction_penalty": reconstruction_penalty,
            "unknown_attribute_penalty": unknown_attribute_penalty,
            "unknown_attribute_noise_param": unknown_attribute_noise_param,
            "attribute_dropout_rate": 0.1,
            "use_batch_norm": False,
            "use_layer_norm": False,
            "seed": 42,
        }

        # Optimizer/scheduler settings, identical for every combination.
        trainer_params = {
            "n_epochs_warmup": 0,
            "autoencoder_lr": 1e-4,
            "autoencoder_wd": 1e-4,
            "attribute_nn_lr": 1e-2,
            "attribute_nn_wd": 4e-8,
            "step_size_lr": 45,
            "cosine_scheduler": True,
            "scheduler_final_lr": 1e-5,
        }
        model = train_model(module_params, trainer_params)
        scores = cluster_evaluate(model, id_)
        # Tag every score row with the hyper-parameters and id that produced it.
        scores[
            'n_latent_attribute_categorical'] = n_latent_attribute_categorical
        scores['reconstruction_penalty'] = reconstruction_penalty
        scores['unknown_attribute_penalty'] = unknown_attribute_penalty
        scores['unknown_attribute_noise_param'] = unknown_attribute_noise_param
        scores['id_'] = id_
        scores = pd.DataFrame(scores)
        model.save(SAVE_DIR + "trained_model_" + str(id_), overwrite=True)
        # Start a new log (with header) for the first model or a missing file;
        # otherwise append rows without repeating the header.
        if id_ == 1 or not exists(LOGS_CSV):
            scores.to_csv(LOGS_CSV)
        else:
            scores.to_csv(LOGS_CSV, mode='a', header=False)
        id_ += 1

In [None]:
# Run the hyper-parameter sweep: trains, scores and saves each selected model.
model_training_iterations()

## Export genes from raw adata

In [None]:
# NOTE(review): `raw_file_name` is not used by the read below, which loads
# "biolord_immune_bcells_bm.h5ad" — confirm which file holds the raw data.
raw_file_name = "biolord_immune_bcells_bm_raw"
adata_raw = sc.read(f"{DATA_DIR}biolord_immune_bcells_bm.h5ad")

In [None]:
# Print the text summary of the raw AnnData object.
print(adata_raw)

In [None]:
# Distinct gene names found in the raw data's `var` table.
unique_genes_raw = set(adata_raw.var['GeneName'])
print(f"numer of unique genes in raw adata file is: {len(unique_genes_raw)}")

# One-column table of those names, ordered alphabetically.
df_unique_genes_raw = pd.DataFrame(unique_genes_raw, columns=['GeneName'])
df_unique_genes_raw = df_unique_genes_raw.sort_values(by=['GeneName'], ascending=True)
print(df_unique_genes_raw)

# Persist the list without the index column.
df_unique_genes_raw.to_csv(f"{SAVE_DIR}unique_genes_raw_data.csv", index=False)

In [None]:
# Distinct gene names found in the processed (filtered) data's `var` table.
unique_genes = set(adata.var['GeneName'])
print(f"numer of unique genes in filtered adata file is: {len(unique_genes)}")

# One-column table of those names, ordered alphabetically.
df_unique_genes = pd.DataFrame(unique_genes, columns=['GeneName'])
df_unique_genes = df_unique_genes.sort_values(by=['GeneName'], ascending=True)
print(df_unique_genes)

# Persist the list without the index column.
df_unique_genes.to_csv(f"{SAVE_DIR}unique_genes_flt_data.csv", index=False)

In [None]:
# Export the GeneID and GeneName columns of the processed data's `var` table.
# NOTE(review): this writes to the same "unique_genes_flt_data.csv" path as the
# preceding cell, replacing the unique-gene list saved there — confirm that
# overwriting is intended or choose a distinct file name.
adata.var[['GeneID', 'GeneName']].to_csv(SAVE_DIR + "unique_genes_flt_data.csv", index=False)