In [1]:
from scdataloader.utils import translate
import scanpy as sc
from anndata.experimental import concat_on_disk
from umap import UMAP
import numpy as np

import bionty as bt
from scdataloader.utils import get_all_ancestors

import matplotlib

import datamapplot

[92m→[0m connected lamindb: jkobject/scprint2


In [2]:
data_directory = "/pasteur/appa/scratch/jkalfon/45322258/"
name = "18hebyht"

In [3]:
adata = sc.read_h5ad(data_directory + name + "_predict.h5ad")  # , backed="r")
adata

AnnData object with n_obs × n_vars = 16406560 × 0
    obs: 'pred_cell_type_ontology_term_id', 'pred_tissue_ontology_term_id', 'pred_disease_ontology_term_id', 'pred_age_group', 'pred_assay_ontology_term_id', 'pred_self_reported_ethnicity_ontology_term_id', 'pred_sex_ontology_term_id', 'pred_organism_ontology_term_id', 'pred_cell_culture', 'conv_pred_cell_type_ontology_term_id', 'conv_pred_tissue_ontology_term_id', 'conv_pred_disease_ontology_term_id', 'conv_pred_age_group', 'conv_pred_assay_ontology_term_id', 'conv_pred_self_reported_ethnicity_ontology_term_id', 'leiden_1.0'
    uns: 'leiden_1.0', 'neighbors', 'umap'
    obsm: 'X_umap', 'scprint_emb_cell_type_ontology_term_id'
    obsp: 'connectivities', 'distances'

In [4]:
res = {}
subres = {}
for i in [
    "pred_sex_ontology_term_id",
    "pred_cell_culture",
    "pred_organism_ontology_term_id",
    "conv_pred_cell_type_ontology_term_id",
    "conv_pred_self_reported_ethnicity_ontology_term_id",
]:
    print(i)
    res[i] = adata.obs[i].value_counts()
    print(len(res[i]))
    subres[i] = {j: j if k > 150 else "other" for j, k in res[i].items()}
    print(len(set(subres[i].values())))

pred_sex_ontology_term_id


2
2
pred_cell_culture


2
2
pred_organism_ontology_term_id


16
16
conv_pred_cell_type_ontology_term_id


337
254
conv_pred_self_reported_ethnicity_ontology_term_id


16
16


In [5]:
rt = translate(
    res["pred_organism_ontology_term_id"].keys(), "organism_ontology_term_id"
)

In [6]:
rt = {j: rt[i] for i, j in enumerate(res["pred_organism_ontology_term_id"].keys())}

In [7]:
rt["NCBITaxon:9483"] = "marmoset"
rt["NCBITaxon:7227"] = "drosophila"
rt["NCBITaxon:3702"] = "arabidopsis"
rt["NCBITaxon:4577"] = "maize"
rt["NCBITaxon:6239"] = "c. elegans"
rt["NCBITaxon:9940"] = "sheep"

In [8]:
adata.obs["conv_pred_organism_ontology_term_id"] = adata.obs[
    "pred_organism_ontology_term_id"
].replace(rt)

  ].replace(rt)


In [9]:
subsubres = {}
for i in [
    "conv_pred_cell_type_ontology_term_id",
]:
    print(i)
    res[i] = adata.obs[i].value_counts()
    print(len(res[i]))
    subsubres[i] = {j: j if k > 1500 else "other" for j, k in res[i].items()}
    print(len(set(subsubres[i].values())))

conv_pred_cell_type_ontology_term_id


337
183


In [10]:
subres["conv_pred_assay_ontology_term_id"] = {
    "10x 3' v2": "10x 3'",
    "10x 3' v3": "10x 3'",
    "10x 3' v1": "10x 3'",
    "10x immune profiling": "10x 3'",
    "10x multiome": "10x 3'",
    "Smart-seq v4": "Smart-seq",
    "Smart-seq2": "Smart-seq",
    "10x 5' v2": "10x 5'",
    "10x 5' v1": "10x 5'",
}

In [11]:
MAX = 3_500_000

In [12]:
plot = datamapplot.create_interactive_plot(
    adata[:MAX].obsm["X_umap"],
    adata[:MAX].obs["conv_pred_organism_ontology_term_id"],
    adata[:MAX]
    .obs["conv_pred_cell_type_ontology_term_id"]
    .replace(subsubres["conv_pred_cell_type_ontology_term_id"]),
    noise_label="other",
    colormap_rawdata=[
        adata[:MAX]
        .obs["conv_pred_cell_type_ontology_term_id"]
        .replace(subres["conv_pred_cell_type_ontology_term_id"])
        .values,
        adata[:MAX].obs["conv_pred_organism_ontology_term_id"].values,
        adata[:MAX]
        .obs["conv_pred_assay_ontology_term_id"]
        .replace(subres["conv_pred_assay_ontology_term_id"])
        .values,
        adata[:MAX].obs["pred_cell_culture"].values,
    ],
    colormap_metadata=[
        {
            "field": "cell type",
            "description": "Predicted Cell Type",
            "cmap": "gist_rainbow",
            "kind": "categorical",
            "n_colors": len(subres["conv_pred_cell_type_ontology_term_id"]),
            # palete
        },
        # organism
        {
            "field": "organism",
            "description": "Predicted Organism",
            "cmap": "tab20",
            "kind": "categorical",
            "n_colors": len(subres["pred_organism_ontology_term_id"]),
        },
        # assay
        {
            "field": "assay",
            "description": "Predicted Assay",
            "cmap": "tab20",
            "kind": "categorical",
            "n_colors": len(subres["conv_pred_assay_ontology_term_id"]),
        },
        # cell culture
        {
            "field": "culture",
            "description": "Predicted Cellular Culture",
            "cmap": "Set3",
            "kind": "categorical",
            "n_colors": len(subres["pred_cell_culture"]),
        },
    ],
    hover_text=(
        "organism: "
        + adata[:MAX].obs["conv_pred_organism_ontology_term_id"].astype(str)
        + "\ncell type: "
        + adata[:MAX].obs["conv_pred_cell_type_ontology_term_id"].astype(str)
        + "\ncellular culture: "
        + adata[:MAX].obs["pred_cell_culture"].astype(str)
        + "\nassay: "
        + adata[:MAX].obs["conv_pred_assay_ontology_term_id"].astype(str)
        + "\ntissue of origin: "
        + adata[:MAX].obs["conv_pred_tissue_ontology_term_id"].astype(str)
        + "\ndisease of origin: "
        + adata[:MAX].obs["conv_pred_disease_ontology_term_id"].astype(str)
        + "\nage: "
        + adata[:MAX].obs["conv_pred_age_group"].astype(str)
        + "\nsex: "
        + adata[:MAX].obs["pred_sex_ontology_term_id"].astype(str)
        + "\n ethnicity: "
        + adata[:MAX]
        .obs["conv_pred_self_reported_ethnicity_ontology_term_id"]
        .astype(str)
    ),
    font_family="DejaVu Sans",
    # cmap="viridis",
    enable_search=True,
    inline_data=False,
    use_medoids=True,
    initial_zoom_fraction=0.5,
)

  .replace(subsubres["conv_pred_cell_type_ontology_term_id"]),


  .replace(subres["conv_pred_cell_type_ontology_term_id"])


  .replace(subres["conv_pred_assay_ontology_term_id"])


  return fit_method(estimator, *args, **kwargs)


In [13]:
plot.save("./nice_umap_scprint3.html")