In [1]:
import os
import re
import csv
import torch
import random
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from scipy.stats import pearsonr
from anndata import AnnData

# for flex attention
import torch._dynamo
import torch.multiprocessing as mp 
torch._dynamo.config.suppress_errors = True

sc.set_figure_params(figsize=(4, 4))

from cellarium.ml.utilities.inference.cellarium_gpt_inference import \
    CellariumGPTInferenceContext

2025-04-30 12:37:53.144166: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_root = '/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm/data/GSE153807/tsvs'

fnames = ['GSM4654467_Nuc-RM101-2.raw.tsv', 
          'GSM4654469_Nuc-RM102-1.raw.tsv', 'GSM4654468_Nuc-RM102-2.raw.tsv', 
          'GSM4654470_Nuc-RM77-1.raw.tsv', 'GSM4654471_Nuc-RM77-2.raw.tsv',
          'GSM4654472_Nuc-RM95-1.raw.tsv', 'GSM4654473_Nuc-RM95-2.raw.tsv']

sex_per_fname = ['Female', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female']
sex_ontology_type_id_per_fname = ['PATO:0000383', 'PATO:0000383', 'PATO:0000383', 'PATO:0000384', 'PATO:0000384', 'PATO:0000383', 'PATO:0000383']

gene_info_path = '/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm/gene_info/gene_info.tsv'
ontology_infos_path = '/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm/ontology_infos.pt'

idx_to_run = 0

In [80]:
gene_info_df = pd.read_csv(gene_info_path, sep='\t', index_col=0)

In [81]:
gene_info_df

Unnamed: 0_level_0,Gene Biotype,Gene Symbol,Gene Source,Gene Synonyms
ENSEMBL Gene ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000291190,lncRNA,A2MP1,NCBI gene (formerly Entrezgene),A2MP
ENSG00000291084,lncRNA,ALOX12P2,NCBI gene (formerly Entrezgene),ALOX12E
ENSG00000286112,protein_coding,KYAT1,NCBI gene (formerly Entrezgene),CCBL1
ENSG00000171097,protein_coding,KYAT1,HGNC Symbol,CCBL1
ENSG00000286112,protein_coding,KYAT1,NCBI gene (formerly Entrezgene),GTK
...,...,...,...,...
ENSG00000288626,protein_coding,,,
ENSG00000288629,protein_coding,,,
ENSG00000288678,protein_coding,,,
ENSG00000290825,lncRNA,DDX11L2,NCBI gene (formerly Entrezgene),


In [82]:
gene_symb_to_gene_id = gene_info_df['Gene Symbol'].reset_index().set_index('Gene Symbol')['ENSEMBL Gene ID'].to_dict()

In [None]:
ontology_infos = torch.load(ontology_infos_path)

# gene_symb_to_gene_id = {}
# # gene_synonym_to_gene_id = {}
# gene_symb_to_gene_syn = {}
# gene_id_to_symbol = {}
# with open(gene_info_path, 'r') as fp:
#     reader = csv.reader(fp, delimiter='\t')
#     next(reader)
#     for row in reader:
#         # merge gene names over symbol and synony,
#         gene_symb_to_gene_id[row[2]] = row[0]
#         # gene_symb_to_gene_id[row[4]] = row[0]
#         gene_symb_to_gene_syn[row[2]] = row[4]
#         # gene_synonym_to_gene_id[row[4]] = row[0]

#         # reverse map to the symbol in our ontology
#         gene_id_to_symbol[row[0]] = row[2]

  ontology_infos = torch.load(ontology_infos_path)


In [70]:
'SLIT2' in gene_symb_to_gene_id

True

In [83]:
df = pd.read_csv(os.path.join(data_root, fnames[idx_to_run]), sep='\t', index_col=0)

In [84]:
df

Unnamed: 0,Nuc-RM101-2_AAACCTGAGTCATCCA-1,Nuc-RM101-2_AAACCTGCAATACGCT-1,Nuc-RM101-2_AAACCTGCACGAGGTA-1,Nuc-RM101-2_AAACCTGCAGATGAGC-1,Nuc-RM101-2_AAACCTGCATAAGACA-1,Nuc-RM101-2_AAACCTGGTGTCCTCT-1,Nuc-RM101-2_AAACCTGGTGTGACGA-1,Nuc-RM101-2_AAACCTGTCAGTACGT-1,Nuc-RM101-2_AAACCTGTCGCCTGTT-1,Nuc-RM101-2_AAACCTGTCTGGGCCA-1,...,Nuc-RM101-2_TTTGTCAAGTTAGGTA-1,Nuc-RM101-2_TTTGTCACATCGGTTA-1,Nuc-RM101-2_TTTGTCACATTGGGCC-1,Nuc-RM101-2_TTTGTCAGTATGGTTC-1,Nuc-RM101-2_TTTGTCAGTCGACTAT-1,Nuc-RM101-2_TTTGTCAGTCGCTTCT-1,Nuc-RM101-2_TTTGTCAGTCTGGTCG-1,Nuc-RM101-2_TTTGTCATCACTATTC-1,Nuc-RM101-2_TTTGTCATCAGTACGT-1,Nuc-RM101-2_TTTGTCATCTCTTATG-1
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
A2M-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
ZYG11A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYG11B,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,1,1,1
ZYX,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
# ontology_genes = set(gene_symb_to_gene_id.keys()).union(set(gene_synonym_to_gene_id.keys()))
ontology_genes = set(gene_symb_to_gene_id.keys())
df = df[df.index.isin(ontology_genes)]

In [86]:
df

Unnamed: 0,Nuc-RM101-2_AAACCTGAGTCATCCA-1,Nuc-RM101-2_AAACCTGCAATACGCT-1,Nuc-RM101-2_AAACCTGCACGAGGTA-1,Nuc-RM101-2_AAACCTGCAGATGAGC-1,Nuc-RM101-2_AAACCTGCATAAGACA-1,Nuc-RM101-2_AAACCTGGTGTCCTCT-1,Nuc-RM101-2_AAACCTGGTGTGACGA-1,Nuc-RM101-2_AAACCTGTCAGTACGT-1,Nuc-RM101-2_AAACCTGTCGCCTGTT-1,Nuc-RM101-2_AAACCTGTCTGGGCCA-1,...,Nuc-RM101-2_TTTGTCAAGTTAGGTA-1,Nuc-RM101-2_TTTGTCACATCGGTTA-1,Nuc-RM101-2_TTTGTCACATTGGGCC-1,Nuc-RM101-2_TTTGTCAGTATGGTTC-1,Nuc-RM101-2_TTTGTCAGTCGACTAT-1,Nuc-RM101-2_TTTGTCAGTCGCTTCT-1,Nuc-RM101-2_TTTGTCAGTCTGGTCG-1,Nuc-RM101-2_TTTGTCATCACTATTC-1,Nuc-RM101-2_TTTGTCATCAGTACGT-1,Nuc-RM101-2_TTTGTCATCTCTTATG-1
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
A2M-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
ZYG11A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYG11B,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,1,1,1
ZYX,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
original_symbols = df.index.to_series(name='gene_symbol')
mapped_ids = original_symbols.map(lambda s: gene_symb_to_gene_id.get(s))
df.index = mapped_ids

data = {
    'suspension_type': ['nucleus'] * len(df.columns),
    'total_mrna_umis': df.sum(axis=0),
    'assay_ontology_term_id': ['EFO:0009899'] * len(df.columns),
    'assay': ["10x 3' v2"] * len(df.columns),
    'sex': [sex_per_fname[idx_to_run]] * len(df.columns),
    'sex_ontology_term_id': [sex_ontology_type_id_per_fname[idx_to_run]] * len(df.columns)
}

obs = pd.DataFrame(index=df.columns, data=data)
var = pd.DataFrame(index=df.index)        # one row per gene ID
var['gene_symbol'] = original_symbols     # store the original symbol

adata = AnnData(X=df.values.T, obs=obs, var=var)

In [77]:
print(adata)
vals, counts = np.unique(adata.var_names, return_counts=True)

AnnData object with n_obs × n_vars = 5491 × 16663
    obs: 'suspension_type', 'total_mrna_umis', 'assay_ontology_term_id', 'assay', 'sex', 'sex_ontology_term_id'
    var: 'gene_symbol'


In [78]:
counts.max()

1

In [48]:
gene_id_to_symbol['ENSG00000184347']

'SLIT3'

In [49]:
gene_symb_to_gene_syn['SLIT3']

'SLIT2'

In [55]:
df[df.index.isin(['SLIT3', 'SLIT2'])]

Unnamed: 0,Nuc-RM101-2_AAACCTGAGTCATCCA-1,Nuc-RM101-2_AAACCTGCAATACGCT-1,Nuc-RM101-2_AAACCTGCACGAGGTA-1,Nuc-RM101-2_AAACCTGCAGATGAGC-1,Nuc-RM101-2_AAACCTGCATAAGACA-1,Nuc-RM101-2_AAACCTGGTGTCCTCT-1,Nuc-RM101-2_AAACCTGGTGTGACGA-1,Nuc-RM101-2_AAACCTGTCAGTACGT-1,Nuc-RM101-2_AAACCTGTCGCCTGTT-1,Nuc-RM101-2_AAACCTGTCTGGGCCA-1,...,Nuc-RM101-2_TTTGTCAAGTTAGGTA-1,Nuc-RM101-2_TTTGTCACATCGGTTA-1,Nuc-RM101-2_TTTGTCACATTGGGCC-1,Nuc-RM101-2_TTTGTCAGTATGGTTC-1,Nuc-RM101-2_TTTGTCAGTCGACTAT-1,Nuc-RM101-2_TTTGTCAGTCGCTTCT-1,Nuc-RM101-2_TTTGTCAGTCTGGTCG-1,Nuc-RM101-2_TTTGTCATCACTATTC-1,Nuc-RM101-2_TTTGTCATCAGTACGT-1,Nuc-RM101-2_TTTGTCATCTCTTATG-1
SLIT2,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,9,0,2,0,0
SLIT3,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,1,0,4,0,0


In [None]:
ROOT_PATH = "/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm"

REF_ADATA_FP = '/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm/data/extract_0.h5ad'
OUT_ADATA_DIR = '/work/hdd/bbjr/mallina1/data/human_cellariumgpt_v2/suspension_type_conversion'

GENE_INFO_PATH = os.path.join(ROOT_PATH, "gene_info", "gene_info.tsv")

# CHECKPOINT_PATH = "/work/hdd/bbjr/mallina1/cellarium/models/compute_optimal_checkpoints/epoch=1-step=28244.ckpt"
# CHECKPOINT_PATH = "/work/hdd/bbjr/mallina1/cellarium/models/compute_optimal_checkpoints/epoch=6-step=63560.ckpt"
CHECKPOINT_PATH = "/work/hdd/bbjr/mallina1/cellarium/models/compute_optimal_checkpoints/epoch=10-step=78917.ckpt"

DEVICE = 'cuda'

ref_adata = sc.read_h5ad(REF_ADATA_FP)
ref_var_names = set(ref_adata.var_names)

In [25]:
# ref_var_names

In [87]:
genes_to_keep = ["AC004448.2","AC010894.3","AC011468.3","AC011586.2","AC016708.1","AC022217.3","AC024230.1",
                 "AC044781.1","AC072062.1","AC245014.3","ACTB","AIF1","AL136454.1","ALOX5AP","AMBRA1","APOC1",
                 "APOE","APOO","ARMC9","ATP5F1E","ATP5MC2","ATP6V0B","ATP6V0E1","B2M","BAIAP2L1","BDNF-AS",
                 "BTF3","BTG2","C1QA","C1QB","C1QC","CARMIL1","CCDC200","CCL2","CCL3","CCL3L1","CCL4","CCL4L2",
                 "CD14","CD37","CD63","CD68","CD74","CEBPB","CEBPD","CFL1","CHCHD3","COMMD6","CORO1A","COX4I1",
                 "CST3","CTSB","CYBA","DAPK1","DDIT4","DNAJB1","DUSP1","EEF1A1","EEF1B2","EEF1D","EEF2","EFCAB3",
                 "EIF1","FAU","FCER1G","FCGRT","FOLR2","FOS","FP700111.1","FTH1","FTL","GADD45B","GGACT",
                 "GPR183","GPX4","GRN","GSTP1","H3F3B","HCST","HERPUD1","HLA-A","HLA-B","HLA-C","HLA-DPA1",
                 "HLA-DPB1","HLA-DRA","HLA-DRB1","HLA-DRB5","HLA-E","HMOX1","HNRNPA1","HSP90AA1","HSPA1A",
                 "HSPA1B","HSPB1","IER2","IER3","ITM2B","JUN","JUNB","KIZ-AS1","LAMTOR4","LAPTM4A","LAPTM5",
                 "LINC01500","LINC01736","LINGO1","LTC4S","MAMDC2","MARCKS","MECOM","MT-ATP6","MT-CO1","MT-CO2",
                 "MT-CO3","MT-CYB","MT-ND2","MT-ND3","MT-ND4","MYL6","NACA","NACA2","NBEAL1","NFKBIA","NHSL2",
                 "NINJ1","NPC2","OLFML3","OOEP","OTULINL","PDK4","PFDN5","PFN1","PLD4","PLEKHA6","PLEKHA7",
                 "PNRC1","PSAP","PTMA","PYCARD","RAC1","RACK1","RGS1","RGS10","RHOB","RHOG","RNASE6","RPL10",
                 "RPL10A","RPL11","RPL12","RPL13","RPL13A","RPL14","RPL15","RPL18","RPL18A","RPL19","RPL21",
                 "RPL23","RPL23A","RPL24","RPL27","RPL27A","RPL28","RPL29","RPL3","RPL30","RPL31","RPL32","RPL34",
                 "RPL35","RPL35A","RPL36","RPL36AL","RPL37","RPL37A","RPL38","RPL39","RPL4","RPL41","RPL5","RPL6",
                 "RPL7","RPL7A","RPL8","RPLP0","RPLP1","RPLP2","RPS11","RPS12","RPS13","RPS14","RPS15","RPS15A",
                 "RPS16","RPS17","RPS18","RPS19","RPS2","RPS20","RPS23","RPS24","RPS25","RPS26","RPS27","RPS27A",
                 "RPS28","RPS29","RPS3","RPS3A","RPS4X","RPS5","RPS6","RPS7","RPS8","RPS9","RPSA","S100A11","SAT1",
                 "SERF2","SIK3","SLC25A6","SLC27A4","SLC47A1","SPP1","SRGN","TEX14","TMSB10","TMSB4X","TOMM7","TPT1",
                 "TREM2","TSPO","TUBA1B","TXNRD1","TYROBP","UBA52","UBC","VSIR","XPO5","YBX1","ZFP36","ZFP36L1",
                 "ZFP36L2","ZNF90"]

print(len(genes_to_keep))

gene_ids_to_keep = []
for x in genes_to_keep:
    if x in gene_symb_to_gene_id:
        if gene_symb_to_gene_id[x] in ref_var_names:
            gene_ids_to_keep.append(gene_symb_to_gene_id[x])
    # elif x in gene_synonym_to_gene_id:
    #     if gene_synonym_to_gene_id[x] in ref_var_names:
    #         gene_ids_to_keep.append(gene_synonym_to_gene_id[x])

print(len(gene_ids_to_keep))

n_fixed_query_genes = 4096 - len(gene_ids_to_keep)

246
231


In [27]:
n_fixed_query_genes

3864

In [28]:
adata

AnnData object with n_obs × n_vars = 5491 × 17135
    obs: 'suspension_type', 'total_mrna_umis', 'assay_ontology_term_id', 'assay', 'sex', 'sex_ontology_term_id'
    var: 'gene_symbol'

In [29]:
'ENSG00000132475' in gene_ids_to_keep

True

In [31]:
_adata = adata[:, adata.var_names.isin(ref_var_names)].copy()

  utils.warn_names_duplicates("var")


In [32]:
_adata

AnnData object with n_obs × n_vars = 5491 × 16958
    obs: 'suspension_type', 'total_mrna_umis', 'assay_ontology_term_id', 'assay', 'sex', 'sex_ontology_term_id'
    var: 'gene_symbol'

In [15]:
# _adata = adata.copy()
# _metacell = metacell.copy()
# _adata = adata[:, adata.var_names.isin(list(ref_var_names) + gene_ids_to_keep)].copy()
# _metacell = metacell[:, metacell.var_names.isin(list(ref_var_names) + gene_ids_to_keep)].copy()

# var_names = np.array(list(_adata.var_names))
# _adata = _adata[:, var_names]

In [16]:
sc.pp.highly_variable_genes(_adata, flavor='seurat_v3', n_top_genes=n_fixed_query_genes)
_adata.var['highly_variable']


gene_symbol
ENSG00000121410    False
ENSG00000268895    False
ENSG00000148584     True
ENSG00000175899     True
ENSG00000245105    False
                   ...  
ENSG00000070476    False
ENSG00000203995    False
ENSG00000162378    False
ENSG00000159840     True
ENSG00000074755    False
Name: highly_variable, Length: 16513, dtype: bool

In [17]:
temp_subset = _adata[:, _adata.var['highly_variable']].copy()
print(temp_subset)

final_gene_list = list(set(gene_ids_to_keep + list(temp_subset.var_names)))
print(final_gene_list)
print(len(final_gene_list))

# subset_metacell_adata = _metacell[:, _metacell.var_names.isin(final_gene_list)].copy()
# subset_adata = _adata[:, _adata.var_names.isin(final_gene_list)].copy()

# subset_metacell_adata = _metacell[:, np.array(final_gene_list)].copy()
# subset_adata = _adata[:, final_gene_list].copy()

AnnData object with n_obs × n_vars = 5491 × 3864
    obs: 'suspension_type', 'total_mrna_umis', 'assay_ontology_term_id', 'assay', 'sex', 'sex_ontology_term_id'
    var: 'gene_symbol', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'
['ENSG00000157214', 'ENSG00000106511', 'ENSG00000112367', 'ENSG00000112414', 'ENSG00000006451', 'ENSG00000103056', 'ENSG00000164924', 'ENSG00000198886', 'ENSG00000077420', 'ENSG00000173706', 'ENSG00000110079', 'ENSG00000105088', 'ENSG00000089558', 'ENSG00000188610', 'ENSG00000165685', 'ENSG00000272168', 'ENSG00000168418', 'ENSG00000153253', 'ENSG00000277149', 'ENSG00000164619', 'ENSG00000164116', 'ENSG00000132530', 'ENSG00000063127', 'ENSG00000079691', 'ENSG00000136928', 'ENSG00000162367', 'ENSG00000171603', 'ENSG00000101680', 'ENSG00000104888', 'ENSG00000151577', 'ENSG00000163285', 'ENSG00000157060', 'ENSG00000077279', 'ENSG00000197977', 'ENSG00000196735', 'ENSG00000167100', 'ENSG00000239205', 'ENSG000001641

In [18]:
subset_adata = _adata[:, _adata.var_names.isin(final_gene_list)].copy()

In [19]:
ctx = CellariumGPTInferenceContext(
    cellarium_gpt_ckpt_path=CHECKPOINT_PATH,
    ref_adata_path=REF_ADATA_FP,
    gene_info_tsv_path=GENE_INFO_PATH,
    device=DEVICE,
    attention_backend="mem_efficient"
)

In [20]:
subset_adata.obs['cell_type_ontology_term_id'] = None
subset_adata.obs['tissue_ontology_term_id'] = None
subset_adata.obs['disease_ontology_term_id'] = None
# subset_adata.obs['sex_ontology_term_id'] = None
subset_adata.obs['development_stage_ontology_term_id'] = None

In [21]:
metadata_prompt_dict = {
    "cell_type": False,
    "tissue": False,
    "disease": False,
    "sex": True,
    "development_stage": False
}

In [22]:
batch_size = 32

## run metacell first
query_genes = list(subset_adata.var_names)
pbar = tqdm(total=subset_adata.shape[0])

samples = []

for val_obs_idx in range(0, subset_adata.shape[0], batch_size):
    obs_idx = np.arange(val_obs_idx, min(val_obs_idx + batch_size, subset_adata.shape[0]))

    tokens_dict, context_indices = ctx.generate_tokens_from_adata(subset_adata, 
                                                                    obs_index=obs_idx, 
                                                                    query_var_names=query_genes,
                                                                    metadata_prompt_masks_dict=metadata_prompt_dict,
                                                                    query_total_mrna_umis=4900,
                                                                    query_suspension_type='cell')

    with torch.no_grad():
        gene_logits_nqk = ctx.get_gene_value_logits_from_tokens(tokens_dict,
                                                                context_indices,
                                                                max_counts=None)

        gene_marginal_mean_nq, _ = ctx.calculate_gene_mean_std_from_logits(gene_logits_nqk,
                                                                            gene_logits_nqk.shape[-1],
                                                                            use_logsumexp=True)

        dist = torch.distributions.categorical.Categorical(logits = gene_logits_nqk)
        sampled_counts = dist.sample().cpu()

        samples.append(sampled_counts)

        pbar.update(len(obs_idx))

  0%|                                                                                                                      | 0/5491 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 5491/5491 [32:00<00:00,  2.93it/s]

In [30]:
sampled_X = torch.cat(samples, dim=0)
sampled_X.shape

torch.Size([5491, 4006])

In [32]:
new_X = np.vstack([subset_adata.X, sampled_X.numpy()])
new_obs = subset_adata.obs.copy()
new_obs.suspension_type = 'cell'
new_obs.total_mrna_umis = 4900

new_obs = pd.concat([subset_adata.obs, new_obs], axis=0)

output_adata = AnnData(
    X = new_X,
    obs = new_obs,
    var = subset_adata.var.copy()
)

  utils.warn_names_duplicates("obs")


In [37]:
output_adata.obs.suspension_type = output_adata.obs.suspension_type.astype('category')
output_adata

AnnData object with n_obs × n_vars = 10982 × 4006
    obs: 'suspension_type', 'total_mrna_umis', 'assay_ontology_term_id', 'assay', 'sex', 'sex_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'disease_ontology_term_id', 'development_stage_ontology_term_id'
    var: 'gene_symbol', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'

In [None]:
# del output_adata.obs['cell_type_ontology_term_id']
# del output_adata.obs['tissue_ontology_term_id']
# del output_adata.obs['disease_ontology_term_id']
# del output_adata.obs['sex_ontology_term_id']
# del output_adata.obs['development_stage_ontology_term_id']
# del output_adata.gene_symbol

# sc.write('/work/hdd/bbjr/mallina1/data/human_cellariumgpt_v2/suspension_type_conversion/GSM4654467_Nuc-RM101-2_converted.h5ad', output_adata)

AttributeError: gene_symbol

In [None]:
# np.save('/work/hdd/bbjr/mallina1/data/human_cellariumgpt_v2/suspension_type_conversion/GSM4654467_Nuc-RM101-2_sampled.npy', sampled_X.numpy())

In [44]:
# Normalizing to median total counts
sc.pp.normalize_total(output_adata)
# Logarithmize the data
sc.pp.log1p(output_adata)
sc.tl.rank_genes_groups(output_adata, groupby="suspension_type", method="wilcoxon")

In [47]:
output_adata

AnnData object with n_obs × n_vars = 10982 × 4006
    obs: 'suspension_type', 'total_mrna_umis', 'assay_ontology_term_id', 'assay', 'sex'
    var: 'gene_symbol', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'rank_genes_groups', 'log1p'

In [None]:
df = sc.get.rank_genes_groups_df(output_adata, group=None)
df

Unnamed: 0,group,names,scores,logfoldchanges,pvals,pvals_adj
0,cell,ENSG00000205542,48.479607,2.423626,0.0,0.0
1,cell,ENSG00000187514,47.908340,2.481303,0.0,0.0
2,cell,ENSG00000075624,47.771297,2.146843,0.0,0.0
3,cell,ENSG00000105372,39.881428,2.681514,0.0,0.0
4,cell,ENSG00000140264,38.619816,2.549507,0.0,0.0
...,...,...,...,...,...,...
8007,nucleus,ENSG00000140264,-38.619816,-2.549507,0.0,0.0
8008,nucleus,ENSG00000105372,-39.881428,-2.681514,0.0,0.0
8009,nucleus,ENSG00000075624,-47.771297,-2.146843,0.0,0.0
8010,nucleus,ENSG00000187514,-47.908340,-2.481303,0.0,0.0


In [52]:
df.to_csv('/work/hdd/bbjr/mallina1/data/human_cellariumgpt_v2/suspension_type_conversion/GSM4654467_Nuc-RM101-2_ranked.csv', index=False)