In [1]:
import os
import csv
import scanpy as sc
import pandas as pd
import torch
from anndata import AnnData

In [2]:
data_root = '/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm/data/GSE153807/tsvs'

fnames = ['GSM4654467_Nuc-RM101-2.raw.tsv', 
          'GSM4654469_Nuc-RM102-1.raw.tsv', 'GSM4654468_Nuc-RM102-2.raw.tsv', 
          'GSM4654470_Nuc-RM77-1.raw.tsv', 'GSM4654471_Nuc-RM77-2.raw.tsv',
          'GSM4654472_Nuc-RM95-1.raw.tsv', 'GSM4654473_Nuc-RM95-2.raw.tsv']

sex_per_fname = ['Female', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female']
sex_ontology_type_id_per_fname = ['PATO:0000383', 'PATO:0000383', 'PATO:0000383', 'PATO:0000384', 'PATO:0000384', 'PATO:0000383', 'PATO:0000383']

gene_info_path = '/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm/gene_info/gene_info.tsv'
ontology_infos_path = '/work/hdd/bbjr/mallina1/data/mb-ml-dev-vm/ontology_infos.pt'

idx_to_run = 0

In [3]:
ontology_infos = torch.load(ontology_infos_path)

gene_symb_to_gene_id = {}
gene_synonym_to_gene_id = {}
with open(gene_info_path, 'r') as fp:
    reader = csv.reader(fp, delimiter='\t')
    next(reader)
    for row in reader:
        gene_symb_to_gene_id[row[2]] = row[0]
        gene_synonym_to_gene_id[row[4]] = row[0]

  ontology_infos = torch.load(ontology_infos_path)


In [4]:
df = pd.read_csv(os.path.join(data_root, fnames[idx_to_run]), sep='\t', index_col=0)

In [10]:
original_symbols = df.index.to_series(name='gene_symbol')
mapped_ids = original_symbols.map(lambda s: gene_symb_to_gene_id.get(s, s))
df.index = mapped_ids

data = {
    'suspension_type': ['nucleus'] * len(df.columns),
    'total_mrna_umis': df.sum(axis=0),
    'assay_ontology_term_id': ['EFO:0009899'] * len(df.columns),
    'assay': ["10x 3' v2"] * len(df.columns),
    'sex': [sex_per_fname[idx_to_run]] * len(df.columns),
    'sex_ontology_term_id': [sex_ontology_type_id_per_fname[idx_to_run]] * len(df.columns)
}

obs = pd.DataFrame(index=df.columns, data=data)
var = pd.DataFrame(index=df.index)        # one row per gene ID
var['gene_symbol'] = original_symbols     # store the original symbol

adata = AnnData(X=df.values.T, obs=obs, var=var)

In [12]:
adata

AnnData object with n_obs × n_vars = 5491 × 21283
    obs: 'suspension_type', 'total_mrna_umis', 'assay_ontology_term_id', 'assay', 'sex', 'sex_ontology_term_id'
    var: 'gene_symbol'

In [None]:
sc.tl.rank_genes_groups(adata, groupby="leiden_res_0.50", method="wilcoxon")

TypeError: rank_genes_groups() missing 1 required positional argument: 'groupby'

In [None]:
sc.get.rank_genes_groups_df(adata, group="7").head(5)