In [1]:
import os, glob
import numpy as np
import pandas as pd
import seaborn as sns
import scanpy as sc
import anndata as ad
from scipy.io import mmread
import matplotlib.pyplot as plt

In [None]:
def load_adata(data_dir):
    """Build an AnnData object from the matrix, gene and cell files in a folder.

    The folder must contain exactly one ``*.mtx`` file, at least one file whose
    name contains ``genes`` and at least one whose name contains ``cells``
    (both matched case-insensitively). When several gene or cell files match,
    the first one in sorted order is used.

    Parameters
    ----------
    data_dir : str
        Folder holding the three input files.

    Returns
    -------
    anndata.AnnData
        ``X`` is the transposed Matrix Market matrix in CSR format, ``var`` is
        indexed by the single column of the (header-less) gene file and
        ``obs`` is indexed by the first column of the cell file.

    Raises
    ------
    FileNotFoundError
        If no ``.mtx``, gene or cell file is found.
    ValueError
        If more than one ``.mtx`` file is found.
    """
    # expression mtx file: exactly one is accepted
    mtx_fn_list = sorted(glob.glob(os.path.join(data_dir, '*.mtx')))
    if not mtx_fn_list:
        raise FileNotFoundError('No .mtx file found in the folder.')
    if len(mtx_fn_list) > 1:
        raise ValueError('Multiple .mtx files found in the folder.')
    mtx_fn = mtx_fn_list[0]

    # list the folder once; both lookups below filter the same sorted names
    fn_list = sorted(os.listdir(data_dir))

    # gene file
    gene_fn_list = [f for f in fn_list if 'genes' in f.lower()]
    if not gene_fn_list:
        raise FileNotFoundError("No file containing 'genes' found in the folder.")
    gene_fn = os.path.join(data_dir, gene_fn_list[0])

    # cell/metadata file
    cell_fn_list = [f for f in fn_list if 'cells' in f.lower()]
    if not cell_fn_list:
        raise FileNotFoundError("No file containing 'cells' found in the folder.")
    cell_fn = os.path.join(data_dir, cell_fn_list[0])

    # read files -> return adata
    # Transpose first, then convert: transposing a CSR matrix yields CSC, so
    # the previous `.tocsr().T` order did not actually produce CSR.
    X = mmread(mtx_fn).T.tocsr()
    genes = pd.read_csv(gene_fn, header=None)
    genes.columns = ['genes']  # the gene file is expected to have one column
    genes = genes.set_index('genes')
    obs_df = pd.read_csv(cell_fn, header=0)
    obs_df = obs_df.set_index(obs_df.columns[0])
    return ad.AnnData(X=X, obs=obs_df, var=genes)


In [4]:
# Location of the Dong2020 neuroendocrine dataset, relative to this notebook.
datadir = os.path.join('..', '..', 'data', 'unzip', 'CCCA_Neuroendocrine', 'Data_Dong2020_Neuroendocrine')

# Load each group folder with the same loader, then concatenate the two
# objects with anndata's default settings.
adata1, adata2 = (load_adata(os.path.join(datadir, group)) for group in ('Group1', 'Group2'))
adata = ad.concat([adata1, adata2])

In [5]:
# Show the concatenated object's summary (dimensions and obs columns).
adata

AnnData object with n_obs × n_vars = 55190 × 12113
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment'

In [None]:
# Harmonise the cell metadata: rename the cell-type column, then tag every
# cell with its source dataset and a constant weight of 1.
column_aliases = {'cell_type': 'celltype'}
adata.obs.rename(columns=column_aliases, inplace=True)
for column, value in (('source', 'Dong2020'), ('weight', 1.)):
    adata.obs[column] = value

In [6]:
# Per-cell total-count normalisation followed by a log(1 + x) transform.
# Both calls modify `adata` in place, so run this cell only once per kernel.
# target_sum=None is scanpy's default and is spelled out to make it visible.
sc.pp.normalize_total(adata, target_sum=None)
sc.pp.log1p(adata)

In [10]:
# Save the processed object. The output folder is created first so the write
# does not depend on it already existing (e.g. on a fresh checkout).
outdir = os.path.join('..', '..', 'data', 'processed')
os.makedirs(outdir, exist_ok=True)
adata.write(os.path.join(outdir, 'CCCA_Neuroendocrine.h5ad'))