# Train a `bioLORD` model on the `developing human immune across tissue` dataset (B-cells)

The data was generated by Suo et al.[[1]](https://www.science.org/doi/full/10.1126/science.abo0510) and downloaded from [Lymphoid cells](https://cellgeni.cog.sanger.ac.uk/developmentcellatlas/fetal-immune/PAN.A01.v01.raw_count.20210429.LYMPHOID.embedding.h5ad). <br>
The complete dataset contains a cross-tissue single-cell atlas of developing human immune cells across prenatal hematopoietic, lymphoid, and nonlymphoid peripheral organs. This includes over 900,000 cells, from which the authors identified over 100 cell states.

[[1] Suo, Chenqu, Emma Dann, Issac Goh, Laura Jardine, Vitalii Kleshchevnikov, Jong-Eun Park, Rachel A. Botting et al. "Mapping the developing human immune system across organs." Science (2022): eabo0510.](https://www.science.org/doi/full/10.1126/science.abo0510)


In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from os.path import exists
import torch
import umap.plot
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from cluster_analysis import *
from formatters import *

[rank: 0] Global seed set to 0


In [4]:
print(f"PyTorch version: {torch.__version__}")
# Choose the accelerator from what is usable at runtime.
# `torch.backends.cuda.is_built()` only reports whether this PyTorch binary was
# compiled with CUDA support, so it is True even on a machine with no GPU or
# driver. `torch.cuda.is_available()` checks that a CUDA device can be used.
device = "gpu" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

PyTorch version: 1.13.1+cu117
Using device: gpu


In [5]:
from tqdm import tqdm

# Build a throwaway, disabled progress bar with nothing to iterate over; the
# only purpose of this call is to initialise tqdm's internal lock up front.
tqdm(total=0, disable=True)

<tqdm.std.tqdm at 0x7f88af01f940>

In [6]:
import mplscience

# Apply the mplscience plotting style to matplotlib.
mplscience.set_style()

# Draw a single marker for scatter entries in legends.
plt.rcParams.update({"legend.scatterpoints": 1})

## Set parameters

In [7]:
# Project-relative directories for input data, saved outputs and figures.
DATA_DIR, SAVE_DIR, FIG_DIR = "../data/", "../output/", "../figures/"
# Scores CSV (`trained_models_scores.csv`) located inside SAVE_DIR.
LOGS_CSV = f"{SAVE_DIR}trained_models_scores.csv"

In [None]:
def anova(scores_csv=None, fig_dir=None):
    """Plot score distributions against ``n_latent_attribute_categorical``.

    Despite its name, this function does not run a statistical test. For every
    distinct ``score_name`` in the scores table it draws one figure with one
    panel per ``attribute``; each panel shows a box plot of ``score`` grouped by
    ``n_latent_attribute_categorical`` with the individual points overlaid as a
    swarm plot. Each figure is saved as ``<score_name>_score_boxplot.png`` and
    then displayed.

    Args:
        scores_csv: Path of the CSV file to read. Defaults to ``LOGS_CSV``.
            The file must provide the columns ``score_name``, ``attribute``,
            ``n_latent_attribute_categorical`` and ``score``.
        fig_dir: Directory the PNG files are written to (created if missing).
            Defaults to the current working directory.

    Raises:
        FileNotFoundError: If the scores CSV does not exist.
        KeyError: If one of the required columns is missing.
    """
    from pathlib import Path  # function-scope import keeps this cell self-contained

    df = pd.read_csv(LOGS_CSV if scores_csv is None else scores_csv)

    out_dir = Path(".") if fig_dir is None else Path(fig_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # `unique()` keeps first-appearance order, so figures and panels come out in
    # the same order on every run (iterating over a `set` of strings does not).
    for score_name in df['score_name'].dropna().unique():
        df_metric = df[df['score_name'] == score_name]
        attributes = df_metric['attribute'].dropna().unique()
        if len(attributes) == 0:
            continue

        # One panel per attribute instead of a fixed pair of axes, so any number
        # of attributes is supported. `squeeze=False` always returns a 2-D array
        # of axes, even when there is a single panel.
        fig, axs = plt.subplots(1, len(attributes),
                                figsize=(7 * len(attributes), 7), squeeze=False)
        title = f'n_latent_attribute_categorical vs. score of metric: {score_name}'
        for ax, attribute in zip(axs[0], attributes):
            df_score = df_metric[df_metric['attribute'] == attribute]
            sns.boxplot(x='n_latent_attribute_categorical', y='score', data=df_score,
                        color='#99c2a2', ax=ax)
            sns.swarmplot(x="n_latent_attribute_categorical", y="score", data=df_score,
                          color='#7d0013', ax=ax)
            ax.set_title(f'attribute: {attribute}')
            ax.set_xlabel("n_latent_attribute_categorical")
            ax.set_ylabel("Score")
        fig.suptitle(title, fontsize=14)
        fig.savefig(out_dir / f'{score_name}_score_boxplot.png', format="png", dpi=300)
        plt.show()
        # Release the figure so that looping over many metrics does not pile up
        # open figures in memory.
        plt.close(fig)

In [None]:
# Read the scores CSV, then draw, save and show one box-plot figure per score_name.
anova()