In [1]:
import scanpy as sc
import pandas as pd
import os
import numpy as np

In [2]:
# Logging setting: keep scanpy output to a minimum so it does not crowd the notebook
sc.settings.verbosity = 0             # scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3)

Load definitions

In [3]:
def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """
    Compute the mean of every variable within each group of observations.

    Inputs:
       adata        - An AnnData(-like) object exposing .obs, .var, .var_names, .shape, .X, .layers
                      and positional row indexing (adata[idx])
       group_key    - Column name (or list of column names) in adata.obs to group the observations by
       layer        - Key into adata.layers to average; if None (default) adata.X is used
       gene_symbols - Column of adata.var whose values are used as row labels of the result; if None
                      (default) adata.var_names is used

    Returns:
       A pandas DataFrame with one row per variable and one column per group, holding the
       float64 mean of the selected matrix over the observations of that group.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X

    # Row labels of the result. The original code looked up `adata.var[idx]` (an unbound name)
    # and then ignored the result; use the requested adata.var column instead.
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )

    # grouped.indices maps each group label to the positional row indices of its observations
    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        # np.ravel flattens the (1, n_vars) result that matrix-like inputs return from mean()
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

def marker_gene_expression(anndata, marker_dict, gene_symbol_key=None, partition_key='louvain_r1'):
    """
    Get the mean z-score expression of marker genes in every cluster.

    Inputs:
       anndata         - An AnnData object containing the data set and a partition
       marker_dict     - A dictionary mapping a cell type to its marker genes. The markers should be
                         stored as anndata.var_names or in an anndata.var field with the key given
                         by the gene_symbol_key input
       gene_symbol_key - The key for the anndata.var field with gene IDs or names that correspond to
                         the marker genes. If None (default), anndata.var_names is used
       partition_key   - The key for the anndata.obs field where the cluster IDs are stored (must be
                         categorical). The default is 'louvain_r1'

    Returns:
       A pandas DataFrame indexed by marker gene, with one column per cluster category holding the
       mean scaled expression of that gene in the cluster, plus a 'cell_type' column holding the
       marker_dict key the gene was listed under. Markers not found in the data are skipped.

    Raises:
       KeyError - if partition_key is not a column of anndata.obs, or gene_symbol_key is given but
                  is not a column of anndata.var
    """

    # Test inputs. Raise a real KeyError: a bare `raise` outside an except block only produces
    # "RuntimeError: No active exception to reraise".
    if partition_key not in anndata.obs.columns.values:
        raise KeyError(
            'The partition key was not found in the passed AnnData object. '
            'Have you done the clustering? If so, please pass the cluster IDs with the AnnData object!'
        )

    if (gene_symbol_key is not None) and (gene_symbol_key not in anndata.var.columns.values):
        raise KeyError(
            'The provided gene symbol key was not found in the passed AnnData object. '
            'Check that your cell type markers are given in a format that your anndata object knows!'
        )

    if gene_symbol_key is not None:
        gene_ids = anndata.var[gene_symbol_key]
    else:
        gene_ids = anndata.var_names

    clusters = anndata.obs[partition_key].cat.categories

    # Work on a scaled copy so the caller's object is left untouched
    z_scores = sc.pp.scale(anndata, copy=True)
    cluster_ids = z_scores.obs[partition_key]

    rows = []
    marker_names = []
    for group in marker_dict:
        for gene in marker_dict[group]:
            # Boolean mask over the variables; a symbol may map to several variables
            gene_mask = np.isin(gene_ids, gene)
            if not gene_mask.any():
                continue

            # Per-cell value: mean over all variables the marker maps to (single or multiple mapping)
            per_cell = pd.Series(
                np.ravel(z_scores.X[:, gene_mask].mean(1)),
                index=z_scores.obs.index
            )
            # observed=False keeps every cluster category; reindex pins the column order to `clusters`
            cluster_means = per_cell.groupby(cluster_ids, observed=False).mean().reindex(clusters)

            rows.append(cluster_means.tolist() + [group])
            marker_names.append(gene)

    # Build the frame once, with informative gene symbols as row names
    marker_exp = pd.DataFrame(rows, columns=list(clusters) + ['cell_type'], index=marker_names)

    return marker_exp


### Read scRNA-seq data

In [4]:
# Load the AnnData object from an .h5ad file.
# Alternative input (currently disabled): sc.read_h5ad('mli_raw_annotated.h5ad')
# NOTE(review): absolute, user-specific path — the notebook only runs where this file exists.
anndata_all = sc.read_h5ad('/home/sanjavickovic/data/host-microbiome_data/sc_anndatas/adata_larger_relabeling_after_tsne.h5ad')

In [5]:
# Display the object summary (dimensions and the obs/var/uns keys it carries)
anndata_all

AnnData object with n_obs × n_vars = 419334 × 2091
    obs: 'age', 'Unique_ID', 'Mouse_ID', 'Sample_ID', '10x_version', 'Region', 'Sex', 'batch', 'doublet_scores', 'predicted_doublets', 'leiden', 'cell_type', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_Gm', 'log1p_total_counts_Gm', 'pct_counts_Gm', 'S_score', 'G2M_score', 'phase', 'leiden_cell_types', 'leiden_major_cell_types', 'pheno', 'pheno_cell_types', 'pheno_major_cell_types'
    var: 'gene_ids-0', 'feature_types-0', 'n_cells', 'Gm', 'ribo', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mean', 'std'
    uns: 'age_colors', 'cell_type_colors', "dendrogram_['leiden_cell_types']", "dendrogram_['leiden_major_cell_types']", "dendrogram_['pheno_cell_types']", "dendro

In [6]:
# Make the variable names unique; this modifies anndata_all in place
anndata_all.var_names_make_unique()

## Make grouped means (markers only)

In [8]:
# Mean of anndata_all.X per 'pheno_cell_types' group (rows: variables, columns: groups).
# NOTE(review): the variable name suggests z-scored/normalized values, but this call only
# averages whatever is stored in .X of the loaded file — confirm what that matrix holds.
adata_zscore_norm = grouped_obs_mean(anndata_all, 'pheno_cell_types')

In [9]:
# Write the per-group means computed from anndata_all to CSV
# NOTE(review): absolute, user-specific output path
adata_zscore_norm.to_csv('/home/sanjavickovic/data/host-microbiome_data/sc_mli_scores/mli_zscore_norm_scaled_pheno_log_markers.csv')

In [10]:
## Take .raw and repeat for all genes

In [7]:
# Build a separate AnnData object from the .raw attribute of anndata_all;
# the cells below normalize and scale this object, not anndata_all
adata = anndata_all.raw.to_adata()

## Make grouped means

In [8]:
# Normalize each observation to a total of 1e4, then apply log1p.
# Both calls modify `adata` in place, so re-running this cell without rebuilding `adata`
# applies the transformations a second time.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Mean of adata.X per 'pheno_cell_types' group.
# NOTE(review): this rebinds `adata_zscore_norm`, which an earlier cell assigned from anndata_all;
# no scaling has been applied to `adata` in the cells above this one, despite the "zscore" name.
adata_zscore_norm = grouped_obs_mean(adata, 'pheno_cell_types')

In [None]:
# Write the per-group means computed from `adata` to CSV
# NOTE(review): absolute, user-specific output path
adata_zscore_norm.to_csv('/home/sanjavickovic/data/host-microbiome_data/sc_mli_scores/mli_zscore_norm_all_pheno_log.csv')

In [None]:
# Scale `adata` in place (no copy is requested), clipping values at max_value=10.
# Every cell below this one sees the scaled matrix.
sc.pp.scale(adata, max_value=10)

In [None]:
# Mean of the (now scaled) adata.X per 'pheno_cell_types' group
adata_zscore_norm_scaled = grouped_obs_mean(adata, 'pheno_cell_types')

In [None]:
# Write the per-group means of the scaled matrix to CSV
# NOTE(review): absolute, user-specific output path
adata_zscore_norm_scaled.to_csv('/home/sanjavickovic/data/host-microbiome_data/sc_mli_scores/mli_zscore_norm_scaled_all_pheno_log.csv')

In [None]:
# Mean of adata.X per combination of 'pheno_cell_types' and 'age' (two grouping keys).
# NOTE(review): with a list of keys the group labels are tuples — check how the resulting
# columns are laid out before relying on the exported file.
adata_zscore_norm_scaled_age = grouped_obs_mean(adata, ['pheno_cell_types', 'age',])

In [None]:
# Write the per-(cell type, age) means to CSV
# NOTE(review): absolute, user-specific output path
adata_zscore_norm_scaled_age.to_csv('/home/sanjavickovic/data/host-microbiome_data/sc_mli_scores/mli_zscore_norm_scaled_all_pheno_age_log.csv')