# one-off preparation of the database

In [1]:
!lamin init --storage ~/scprint --schema bionty

✅ saved: User(uid='a3keNAVX', handle='jkobject', updated_at=2023-11-22 09:47:07 UTC)
✅ saved: Storage(uid='GZgLW1TI', root='/home/ml4ig1/scprint', type='local', updated_at=2023-11-22 09:47:07 UTC, created_by_id=1)
💡 loaded instance: jkobject/scprint
💡 did not register local instance on hub
[0m

In [1]:
from collections import Counter
from typing import List, Union

import lamindb as ln
import lnschema_bionty as lb

from scprint import data_utils

%load_ext autoreload
%autoreload 2

💡 lamindb instance: jkobject/scprint


In [2]:
lb.settings.organism = "human"

## prepare lamin database

In [3]:
cx_dataset = ln.Dataset.using("laminlabs/cellxgene-census").one()
cx_dataset

Dataset(uid='EAUF1AaT4kOVyHYnZsUJ', name='cellxgene-census', version='2023-07-25', hash='pEJ9uvIeTLvHkZW2TBT5', visibility=0, updated_at=2023-10-24 16:00:07 UTC, transform_id=1, run_id=9, created_by_id=2)

In [86]:
mydataset = data_utils.load_dataset_local(cx_dataset, "~/scprint/", name="cellxgene-local", description="the full cellxgene database", only=(0,20))

❗ no run & transform get linked, consider passing a `run` or calling ln.track()
❗ record with CAQPyYbcP4UjgiIfniQK already exists on default database: File(uid='CAQPyYbcP4UjgiIfniQK', key='cell-census/2023-07-25/h5ads/d5c67a4e-a8d9-456d-a273-fa01adb1b308.h5ad', suffix='.h5ad', accessor='AnnData', description='Retina|d5c67a4e-a8d9-456d-a273-fa01adb1b308', size=190765719, hash='d8ZhFETfIiYt51mH7_7CtQ-23', hash_type='md5-n', visibility=0, key_is_virtual=False, updated_at=2023-10-16 15:04:08 UTC, storage_id=2, transform_id=1, run_id=1, created_by_id=1)
File cell-census/2023-07-25/h5ads/d5c67a4e-a8d9-456d-a273-fa01adb1b308.h5ad already exists in storage
❗ no run & transform get linked, consider passing a `run` or calling ln.track()
❗ record with U08Kty7hcUhsXiPc86xV already exists on default database: File(uid='U08Kty7hcUhsXiPc86xV', key='cell-census/2023-07-25/h5ads/1492eb6b-7d50-4c4d-94ac-c801a7d5555c.h5ad', suffix='.h5ad', accessor='AnnData', description='Molecular characterization of se

## load some known ontology names

In [3]:
# alternatively, reload the dataset registered above by its name
mydataset = ln.Dataset.filter(name="cellxgene-local").one()

In [65]:
import cellxgene_census

# obs columns whose ontology term ids are collected from the census
val_to_get = [
    'self_reported_ethnicity_ontology_term_id',
    'assay_ontology_term_id',
    'development_stage_ontology_term_id',
    'disease_ontology_term_id',
    'tissue_ontology_term_id',
]
# same row filter for both organisms: keep primary observations only
primary_only = "is_primary_data == True"

# NOTE(review): "latest" is not pinned, while the dataset loaded earlier reports
# version '2023-07-25' -- consider pinning census_version for reproducibility.
# The census handle is used as a context manager so it is closed as soon as
# both obs tables have been materialised as pandas DataFrames.
with cellxgene_census.open_soma(census_version="latest") as census:
    # df: human obs table, df2: mouse obs table
    df = census["census_data"]["homo_sapiens"].obs.read(
        column_names=val_to_get, value_filter=primary_only
    ).concat().to_pandas()
    df2 = census["census_data"]["mus_musculus"].obs.read(
        column_names=val_to_get, value_filter=primary_only
    ).concat().to_pandas()
df.shape

(39055600, 6)

In [70]:
df2['disease_ontology_term_id'].unique()

array(['PATO:0000461', 'MONDO:0005147', 'MONDO:0005148', 'MONDO:0001933',
       'MONDO:0002052'], dtype=object)

In [67]:
# Assays, tissues and diseases: union of the term ids found in `df` and `df2`.
shared_terms = {
    argument: list(set(df[column].unique()).union(df2[column].unique()))
    for argument, column in (
        ("assays", 'assay_ontology_term_id'),
        ("tissues", 'tissue_ontology_term_id'),
        ("diseases", 'disease_ontology_term_id'),
    )
}
data_utils.populate_my_ontology(
    lb=lb,
    organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
    # ethnicities and developmental stages are read from `df` only
    ethnicities=df['self_reported_ethnicity_ontology_term_id'].unique().tolist(),
    dev_stages=list(df['development_stage_ontology_term_id'].unique()),
    **shared_terms,
)

❗ now recursing through parents: this only happens once, but is much slower than bulk saving
❗ now recursing through parents: this only happens once, but is much slower than bulk saving


In [None]:
# Register the developmental-stage term ids found in `df2`, resolved against
# the BiontySource record for entity "DevelopmentalStage" / organism "mouse".
bionty_source_ds_mouse = lb.BiontySource.filter(
    entity="DevelopmentalStage",
    organism="mouse",
).one()
mouse_stage_ids = df2['development_stage_ontology_term_id'].unique().tolist()
records = lb.DevelopmentalStage.from_values(
    mouse_stage_ids,
    field=lb.DevelopmentalStage.ontology_id,
    bionty_source=bionty_source_ds_mouse,
)
ln.save(records)

In [72]:
import bionty as bt

In [73]:
names = bt.Disease().df().index
records = lb.Disease.from_values(names, field=lb.Disease.ontology_id)
ln.save(records)

❗ now recursing through parents: this only happens once, but is much slower than bulk saving


In [87]:
assay = ['EFO:0010961']

In [88]:
records = lb.ExperimentalFactor.from_values(assay, field=lb.ExperimentalFactor.ontology_id)
ln.save(records)

❗ now recursing through parents: this only happens once, but is much slower than bulk saving


## add some missing ontology names

In [5]:
from scprint.dataset.utils import get_ancestry_mapping

In [None]:
# Hand-picked terms to add on top of the ones registered earlier.
# Each dictionary maps an ontology id to its name.
# NOTE(review): none of the visible cells consume these dictionaries yet.

# Tissue terms (UBERON ids).
additional_tissues = {
    "UBERON:0037144": "wall of heart",
    "UBERON:0003929": "digestive tract epithelium",
    "UBERON:0002020": "gray matter",
    "UBERON:0000200": "gyrus",
    "UBERON:0000101": "lobe of lung",
    "UBERON:0001981": "blood vessel",
    "UBERON:0001474": "bone element",
}

# Disease terms (MONDO ids).
additional_diseases = {
    "MONDO:0001106": "kidney failure",
    "MONDO:0021166": "inflammatory disease",
    "MONDO:0004992": "cancer",
    "MONDO:0004994": "cardiomyopathy",
    "MONDO:0700065": "trisomy",
    "MONDO:0021042": "glioma",
    "MONDO:0005265": "inflammatory bowel disease",
    "MONDO:0005550": "infectious disease",
    "MONDO:0005059": "leukemia",
}

# Assay terms (EFO ids).
additional_assays = {
    "EFO:0010184": "Smart-like",
}

In [None]:
# need to solve issues with CellType


The cells below were used to work out which terms might need to be added: they list the tissue leaves that have no parent in the ancestry mapping.

In [None]:
# Registered tissues with their parents' ontology ids, indexed by ontology id.
tissue_parents = lb.Tissue.filter().df(include=["parents__ontology_id"]).set_index("ontology_id")
mapping, anc, leafs = get_ancestry_mapping(df['tissue_ontology_term_id'].unique(), tissue_parents)
# getting only the leaves for which we don't have a parent
# `set().union(...)` also handles an empty `mapping`, for which
# `set.union(*[])` raises TypeError.
leafs = list(leafs - set().union(*(mapping[val] for val in mapping.keys())))

In [None]:
lb.Tissue.search(list(leafs)[108], field="ontology_id",return_queryset=True).first().view_parents()

## Preprocessing

In [5]:
from scprint.dataset.preprocess import Preprocessor
import scanpy as sc
import numpy as np


In [62]:
def additional_preprocess(adata):
    """Map free-text ethnicity labels to ``'unknown'``.

    The values listed below are replaced in the
    ``self_reported_ethnicity_ontology_term_id`` column of ``adata.obs``.
    ``adata.obs`` is reassigned to the replaced frame and the same ``adata``
    object is returned.
    """
    # multi ethnic will have to get renamed
    to_unknown = ('multiethnic', 'American', 'Jewish Israeli', 'na')
    ethnicity_map = {label: 'unknown' for label in to_unknown}
    replaced_obs = adata.obs.replace(
        {'self_reported_ethnicity_ontology_term_id': ethnicity_map}
    )
    adata.obs = replaced_obs
    return adata

def additional_postprocess(adata, min_group_size=10, drop_group_below=3):
    """Add per-group diffusion pseudotime to ``adata`` and sort cells by it.

    Steps:
      1. run ``sc.tl.diffmap`` on ``adata``;
      2. build ``obs['dpt_group']`` by joining ``leiden_1`` and the disease,
         cell type and tissue ontology term ids with ``"_"``;
      3. set the label of groups with fewer than ``drop_group_below`` cells
         to ``''``;
      4. for each group with at least ``min_group_size`` cells, take as root
         the group's cell with the smallest value in column 0 of
         ``obsm['X_diffmap']``, run ``sc.tl.dpt`` and copy the group's
         ``dpt_pseudotime`` into ``obs['heat_diff']``;
      5. sort the cells by ``['dpt_group', 'heat_diff']``.

    Groups whose size is in ``[drop_group_below, min_group_size)`` keep their
    label but their ``heat_diff`` stays NaN.

    Args:
        adata: object exposing ``obs``, ``obsm`` and ``uns``; ``obs`` must
            contain ``leiden_1``, ``disease_ontology_term_id``,
            ``cell_type_ontology_term_id`` and ``tissue_ontology_term_id``.
        min_group_size: minimum number of cells for a group to get a
            pseudotime (default 10, the previously hard-coded value).
        drop_group_below: groups with fewer cells than this have their label
            blanked (default 3, the previously hard-coded value).

    Returns:
        ``adata`` indexed by the sorted obs index.

    Raises:
        ValueError: if ``min_group_size`` is smaller than ``drop_group_below``
            (a group could then be both blanked and processed).
    """
    if min_group_size < drop_group_below:
        raise ValueError("min_group_size must be >= drop_group_below")

    sc.tl.diffmap(adata)
    # create a meta group
    adata.obs['dpt_group'] = (
        adata.obs['leiden_1'].astype(str)
        + "_" + adata.obs['disease_ontology_term_id'].astype(str)
        + "_" + adata.obs['cell_type_ontology_term_id'].astype(str)
        + "_" + adata.obs['tissue_ontology_term_id'].astype(str)
    )  # + "_" + adata.obs['dataset_id'].astype(str)

    # group sizes are computed once, before any label is blanked
    group_sizes = adata.obs['dpt_group'].value_counts()
    okgroup = [group for group, size in group_sizes.items() if size >= min_group_size]
    not_okgroup = [group for group, size in group_sizes.items() if size < drop_group_below]
    # if group is too small, set the group to empty
    adata.obs.loc[adata.obs['dpt_group'].isin(not_okgroup), 'dpt_group'] = ''
    adata.obs['heat_diff'] = np.nan
    # for each group (value_counts labels are unique, so no de-duplication is needed)
    for val in okgroup:
        if val == '':
            # '' marks blanked cells, not a real group
            continue
        in_group = (adata.obs['dpt_group'] == val).to_numpy()
        cell_positions = np.flatnonzero(in_group)

        # get the best root cell
        # NOTE(review): scanpy documents column 0 of X_diffmap as the
        # steady-state, non-informative component -- confirm that rooting on
        # it (rather than column 1) is intended.
        root_ixs = cell_positions[adata.obsm["X_diffmap"][in_group, 0].argmin()]
        adata.uns["iroot"] = root_ixs
        # compute the diffusion pseudo time from it
        sc.tl.dpt(adata)
        adata.obs.loc[in_group, 'heat_diff'] = adata.obs.loc[in_group, 'dpt_pseudotime']
        # dpt_pseudotime is recomputed for every root, so drop it after copying
        adata.obs.drop(columns=['dpt_pseudotime'], inplace=True)

    # sort so that the next time points are aligned for all groups
    adata = adata[adata.obs.sort_values(['dpt_group', 'heat_diff']).index]
    # to query N next time points we just get the N elements below and check they are in the group
    # to query the N nearest neighbors we just get the N elements above and N below and check they are in the group
    return adata


In [89]:
do_preprocess = Preprocessor(lb, additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess)

In [90]:
preprocessed_dataset = do_preprocess(mydataset, start_at=6)

❗ no run & transform get linked, consider passing a `run` or calling ln.track()
0
AnnData object with n_obs × n_vars = 644 × 16380
    obs: 'nCount_Spatial', 'nFeature_Spatial', 'Cluster', 'adult_pred_type', 'adult_pred_value', 'fetal_pred_type', 'fetal_pred_value', 'Cell Cycle', 'Type 3 ILCs', 'DCs', 'Macrophages', 'Lymphatic', 'Arterial (CP) 2', 'Venous (CP) 1', 'Venous (M) 2', 'Endothelium G2M-phase', 'Venous (CP) 2', 'Arterial (CP)', 'Arterial (M)', 'Endothelium S-phase', 'Proximal Progenitor', 'Proximal Mature Enterocytes', 'BEST4_OTOP2 Cells', 'Proximal TA', 'Proximal Early Enterocytes', 'Proximal Enterocytes', 'Proximal Stem Cells', 'EECs', 'Distal Enterocytes', 'Goblets', 'Distal TA', 'Distal Absorptive', 'Distal Stem Cells', 'Secretory Progenitors', 'Distal Mature Enterocytes', 'S1', 'S1 COL6A5+', 'Proximal S2 (2)', 'Distal S2', 'Fibroblasts S-phase', 'Proximal S2 (1)', 'S3 Progenitor', 'Fibroblasts G2M-phase', 'S3', 'Fibroblast Progenitor', 'Erythroid', 'S3 EBF+', 'S3 HAND1+'

  df[key] = c
... storing 'dpt_group' as categorical
  df[key] = c
... storing 'symbol' as categorical
  df[key] = c
... storing 'ncbi_gene_id' as categorical
  df[key] = c
... storing 'biotype' as categorical
  df[key] = c
... storing 'description' as categorical
  df[key] = c
... storing 'synonyms' as categorical


1
AnnData object with n_obs × n_vars = 40436 × 26779
    obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'donor_BMI', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'tissue_section_uuid', 'tissue_section_thickness', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_predicted_cell_type', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'predicted.celltype.score', 'array', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'default_embedding', 'schema_version', 'title'
    obsm: 'X_pca', 'X_ref.pca', 'X_ref.umap', 'X_spatial', 'X_umap'
Removed 65 genes.
Seeing 5069

  df[key] = c
... storing 'dpt_group' as categorical
  df[key] = c
... storing 'symbol' as categorical
  df[key] = c
... storing 'ncbi_gene_id' as categorical
  df[key] = c
... storing 'biotype' as categorical
  df[key] = c
... storing 'description' as categorical
  df[key] = c
... storing 'synonyms' as categorical


2
AnnData object with n_obs × n_vars = 17622 × 33178
    obs: 'nCount_RNA', 'nFeature_RNA', 'BroadLocation', 'Location', 'PCW', 'Genotype', 'Pool', 'Collection.ID', 'Cluster', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'assay_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'schema_version', 'title'
    obsm: 'X_harmony', 'X_pca', 'X_umap'
Dataset dropped because contains too many secondary cells
3
AnnData object with n_obs × n_vars = 72335 × 36503
    obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'organism_ontology_term_id', 'sample_uuid',

  df[key] = c
... storing 'dpt_group' as categorical
  df[key] = c
... storing 'symbol' as categorical
  df[key] = c
... storing 'ncbi_gene_id' as categorical
  df[key] = c
... storing 'biotype' as categorical
  df[key] = c
... storing 'description' as categorical
  df[key] = c
... storing 'synonyms' as categorical


4
AnnData object with n_obs × n_vars = 19361 × 33178
    obs: 'Genes', 'Run', 'X', 'Y', 'Virus', 'Name', 'Treatment', 'Passch.Basal', 'Passch.Interm. basal.secr.', 'Passch.Secretory', 'Passch.Ciliated', 'Passch.FOXN4', 'Passch.BrushPNEC', 'Passch.Ionocytes', 'Passch.SLC16A7', 'CellType', 'Basal2', 'DividingCell', 'Name2', 'Cluster', 'Name3', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'schema_version', 'title'
    obsm: 'X_umap'
Removed 94 genes.
❗ received 1 unique term, 19360 empty/duplicated terms are ignored
❗ [1;93m1 term[0m (100.

ValueError: Invalid sex ontology term id found

In [82]:
tdata = mydataset.files.all()[6].load()

❗ no run & transform get linked, consider passing a `run` or calling ln.track()


In [85]:
tdata.obs.assay_ontology_term_id.unique()

['EFO:0010961']
Categories (1, object): ['EFO:0010961']

In [None]:
ln.File.filter(version='2', description='preprocessed by scprint')

In [None]:
# TODO: finish the annotation things DONE
# TODO: finish the clustering thing DONE
# TODO: test on 20 datasets
# TODO: finish esm embedding
# TODO: make the dataloader for the model
# TODO: add gene length to the preprocessor if needed

In [207]:
labels_weighted_sampling = [
    'self_reported_ethnicity_ontology_term_id',
    'assay_ontology_term_id',
    'development_stage_ontology_term_id',
    'disease_ontology_term_id',
    'cell_type_ontology_term_id',
    'tissue_ontology_term_id',
    'sex_ontology_term_id',
]

In [208]:
mapped_dataset = mydataset.mapped(label_keys=labels_weighted_sampling)

❗ no run & transform get linked, consider passing a `run` or calling ln.track()


In [None]:
def get_label_weights(self, label_keys: Union[str, List[str]]):
    """Get the weight of every observation for the given label key(s).

    With several keys, the labels of each key are converted to strings and
    joined element-wise with ``"_"``; the weight of an observation is the
    count of its (joined) label divided by the number of observations.

    NOTE(review): weights grow with label frequency. If they feed a sampler
    meant to up-weight rare labels, the inverse may be what is wanted --
    confirm with the caller.

    Args:
        label_keys: one label key, or a list/tuple of label keys; each must
            be in ``self.label_keys``.

    Returns:
        numpy array with one weight per observation.

    Raises:
        ValueError: if ``label_keys`` is empty or contains a key that is not
            in ``self.label_keys``.
    """
    if not isinstance(label_keys, (list, tuple)):
        label_keys = [label_keys]
    if len(label_keys) == 0:
        # previously this fell through to an UnboundLocalError on `labels`
        raise ValueError("label_keys must contain at least one label key.")
    for key in label_keys:
        if key not in self.label_keys:
            raise ValueError(f"{key} is not a valid label key.")

    if len(label_keys) == 1:
        labels = self.get_merged_labels(label_keys[0])
    else:
        # Copy each label vector into an object array of str: `+` then joins
        # element-wise (it fails on fixed-width unicode arrays) and the
        # arrays returned by get_merged_labels are never modified in place.
        columns = [
            np.asarray(self.get_merged_labels(key)).astype(str).astype(object)
            for key in label_keys
        ]
        labels = columns[0]
        for column in columns[1:]:
            labels = labels + "_" + column

    counter = Counter(labels)  # type: ignore
    weights = np.array([counter[label] for label in labels]) / len(labels)
    return weights