In [1]:
import scanpy as sc
import pandas as pd
import os
import numpy as np

In [2]:
# Logging verbosity: keep scanpy quiet so messages do not crowd the notebook
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)

Load definitions

In [3]:
def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Average each variable (column of the data matrix) within groups of observations.

    Parameters
    ----------
    adata
        AnnData-like object. This function reads ``adata.obs``, ``adata.var``,
        ``adata.var_names``, ``adata.shape`` and either ``adata.X`` or
        ``adata.layers[layer]``, and subsets it with positional row indices.
    group_key
        Column name (or list of column names) of ``adata.obs`` handed to
        ``DataFrame.groupby`` to define the groups.
    layer
        Key of ``adata.layers`` to average. ``None`` (default) averages ``adata.X``.
    gene_symbols
        Column of ``adata.var`` whose values label the rows of the result.
        ``None`` (default) labels the rows with ``adata.var_names``.

    Returns
    -------
    pandas.DataFrame
        One row per variable and one column per group, holding float64 means.
    """
    def _matrix(view):
        # Pick the matrix to average: a named layer if requested, otherwise X.
        return view.X if layer is None else view.layers[layer]

    # Row labels of the result. The original code looked up ``adata.var[idx]``
    # here, but ``idx`` is only bound later by the loop, so passing
    # ``gene_symbols`` always failed; it also never used the labels it computed.
    if gene_symbols is not None:
        row_labels = pd.Index(adata.var[gene_symbols].values)
    else:
        row_labels = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=grouped.groups.keys(),
        index=row_labels
    )

    # ``grouped.indices`` maps each group label to the positional row indices
    # of its members, which is what the subsetting below expects.
    for group, row_positions in grouped.indices.items():
        X = _matrix(adata[row_positions])
        # ravel() flattens the 1 x n_vars result that matrix-like inputs return.
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

def marker_gene_expression(anndata, marker_dict, gene_symbol_key=None, partition_key='louvain_r1'):
    """
    Get the mean z-score expression of marker genes in every cluster.

    Inputs:
       anndata         - An AnnData object containing the data set and a partition
       marker_dict     - A dictionary mapping a cell-type label to an iterable of marker
                         genes. The markers should be stored as anndata.var_names or in
                         the anndata.var field given by gene_symbol_key
       gene_symbol_key - The key of the anndata.var field with gene IDs or names that
                         correspond to the marker genes. If None, anndata.var_names is used
       partition_key   - The key of the (categorical) anndata.obs field where the cluster
                         IDs are stored. The default is 'louvain_r1'

    Returns:
       A pandas DataFrame with one row per marker gene that was found in the data
       (indexed by the gene name as written in marker_dict), one column per cluster
       category, and a final 'cell_type' column holding the marker_dict key the gene
       was listed under. Markers that are absent from the data are skipped.

    Raises:
       KeyError if partition_key is not a column of anndata.obs, or if gene_symbol_key
       is given but is not a column of anndata.var.
    """

    # Validate inputs. The original code printed a message and then used a bare
    # `raise`, which (with no active exception) fails with an unrelated RuntimeError.
    if partition_key not in anndata.obs.columns.values:
        raise KeyError(
            'The partition key was not found in the passed AnnData object. '
            'Have you done the clustering? If so, please pass the cluster IDs '
            'with the AnnData object!'
        )

    if (gene_symbol_key is not None) and (gene_symbol_key not in anndata.var.columns.values):
        raise KeyError(
            'The provided gene symbol key was not found in the passed AnnData object. '
            'Check that your cell type markers are given in a format that your '
            'anndata object knows!'
        )

    if gene_symbol_key:
        gene_ids = np.asarray(anndata.var[gene_symbol_key])
    else:
        gene_ids = np.asarray(anndata.var_names)

    partition = anndata.obs[partition_key]
    clusters = partition.cat.categories
    # Group by the raw categorical values (positional) so that a non-unique
    # obs index cannot interfere with alignment.
    partition_values = partition.values

    # Work on a scaled copy so the caller's object is left untouched.
    z_scores = sc.pp.scale(anndata, copy=True)

    rows = []
    marker_names = []
    for group, genes in marker_dict.items():
        for gene in genes:
            gene_mask = np.isin(gene_ids, gene)  # a name may map to several variables
            if not gene_mask.any():
                continue

            # Per-cell score: mean over all variables matching this gene name
            # (works for both single and multiple mappings).
            per_cell = np.asarray(z_scores.X[:, gene_mask].mean(axis=1)).ravel()

            # Per-cluster mean, forced into the category order used for the columns.
            # observed=False keeps empty categories so the row length always matches.
            cluster_means = (
                pd.Series(per_cell)
                .groupby(partition_values, observed=False)
                .mean()
                .reindex(clusters)
            )
            rows.append(cluster_means.tolist() + [group])
            marker_names.append(gene)

    # Build the frame once instead of growing it row by row; rows are labelled
    # with the informative gene names from marker_dict.
    marker_exp = pd.DataFrame(
        rows,
        columns=list(clusters) + ['cell_type'],
        index=marker_names,
    )

    return marker_exp


### Read scRNA-seq data

In [4]:
# Load the AnnData object from an .h5ad file.
# NOTE(review): the path is an absolute location on one user's machine, and the
# recorded output of this cell is an OSError ("No such file or directory").
# Confirm the intended input file before relying on the outputs further down.
#anndata_all = sc.read_h5ad('mli_raw_annotated.h5ad')
anndata_all = sc.read_h5ad('/home/sanjavickovic/data/magic_genes/magic_subset_sig_all_genes.h5ad')

OSError: Unable to open file (unable to open file: name = '/home/sanjavickovic/data/magic_genes/magic_subset_sig_all_genes.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [7]:
# Inspect the variable (gene) names of the loaded object
anndata_all.var_names

Index(['Mrpl15', 'Lypla1', 'Tcea1', 'Atp6v1h', 'Rb1cc1', 'Vcpip1', 'Snhg6',
       'Cops5', 'Cspp1', 'Arfgef1',
       ...
       'Hccs', 'mt-Nd1', 'mt-Nd2', 'mt-Co1', 'mt-Nd4', 'mt-Nd5', 'mt-Nd6',
       'mt-Cytb', 'Vamp7', 'Tmlhe'],
      dtype='object', length=6825)

In [54]:
# De-duplicate variable names; the return value is not used, so this relies on
# the call modifying anndata_all itself.
# NOTE(review): this cell's execution count (54) is out of order with its
# neighbours — confirm the notebook still runs top to bottom on a fresh kernel.
anndata_all.var_names_make_unique()

In [8]:
# Work on an independent copy: plain assignment would only alias the object, so
# the in-place sc.pp.scale call further down would also rewrite anndata_all.
adata = anndata_all.copy()

## Make grouped means

In [9]:
# Mean of adata.X per variable within each 'pheno_cell_types' group.
# NOTE(review): despite the variable name, nothing in this notebook has z-scored
# the data before this point (sc.pp.scale runs in a later cell) — confirm what
# adata.X holds in the input file.
adata_zscore_norm = grouped_obs_mean(adata, 'pheno_cell_types')

In [11]:
# Write the unscaled per-group means to CSV (hard-coded absolute output path)
adata_zscore_norm.to_csv('/home/sanjavickovic/data/sc_mli_scores/mli_zscore_norm_all_pheno_log_magic.csv')

In [12]:
# Scale adata with max_value=10. The result is not assigned, so this relies on
# sc.pp.scale modifying adata in place; every later cell sees the scaled values.
sc.pp.scale(adata, max_value=10)

In [13]:
# Per-group means again, now computed on the scaled adata.X
adata_zscore_norm_scaled = grouped_obs_mean(adata, 'pheno_cell_types')

In [61]:
# Write the scaled per-group means to CSV (hard-coded absolute output path)
adata_zscore_norm_scaled.to_csv('/home/sanjavickovic/data/sc_mli_scores/mli_zscore_norm_scaled_all_pheno_log_magic.csv')

In [62]:
# Group by two obs columns at once: each result column is then keyed by a
# ('pheno_cell_types', 'age') combination rather than a single label.
adata_zscore_norm_scaled_age = grouped_obs_mean(adata, ['pheno_cell_types', 'age',])

In [63]:
# Write the scaled per-(cell type, age) means to CSV (hard-coded absolute output path)
adata_zscore_norm_scaled_age.to_csv('/home/sanjavickovic/data/sc_mli_scores/mli_zscore_norm_scaled_all_pheno_age_log_magic.csv')

In [29]:
# Turn the ranking results stored under adata.uns['pheno_minor_cell_types']
# into one DataFrame per group: rows are that group's ranked names, columns
# are the selected statistics plus a 'cell_type' label.
rank_genes_groups = adata.uns['pheno_minor_cell_types']
rank_genes_groups_keys = ['logfoldchanges', 'pvals_adj', 'scores']
# The field names of the structured 'names' array are the group labels.
group_names = rank_genes_groups['names'].dtype.names

group_df_final = []
for group_name in group_names:
    group_df = pd.DataFrame(index=rank_genes_groups['names'][group_name])
    for stat_key in rank_genes_groups_keys:
        group_df[stat_key] = rank_genes_groups[stat_key][group_name]
    group_df['cell_type'] = group_name
    group_df_final.append(group_df)


In [30]:
# Stack the per-group frames into a single DataFrame.
# NOTE(review): this rebinds group_df_final from a list to a DataFrame, so the
# cell cannot be re-run without first re-running the cell that builds the list.
group_df_final = pd.concat(group_df_final)

In [31]:
# Keep only rows with logfoldchanges > 2 and pvals_adj < 0.01.
# NOTE(review): both thresholds are hard-coded here.
group_df_final = group_df_final[(group_df_final['logfoldchanges'] > 2) & (group_df_final['pvals_adj'] < 0.01)]

In [32]:
# Map every cell type that survived the filter to the list of its row labels
# (the gene names used as the index of group_df_final).
markers_dict = {
    cell_type: group_df_final.loc[group_df_final['cell_type'] == cell_type].index.tolist()
    for cell_type in np.unique(group_df_final['cell_type'])
}

In [33]:
# Display the full marker dictionary.
# NOTE(review): this prints every gene of every cell type, which makes for a
# very long output.
markers_dict

{'B_cell': ['S100a6',
  'Gsdmc4',
  'Actg1',
  'Ftl1',
  'Ckmt1',
  'Slc25a5',
  'Zcchc12',
  'Kif2c',
  'Zdhhc14',
  'Cyp2c69',
  'Ngfr',
  'Abca13',
  'Cplx2',
  'Agt',
  'Ly6g6d',
  'Bcam',
  'Krt23',
  'Man1c1',
  'Ms4a4d',
  'Pygl',
  'Aoc3',
  'Camk2a',
  'Crispld1',
  '4930467D21Rik',
  'A330102I10Rik',
  'Ar',
  'Vwf',
  'Rasip1',
  'Mettl21b',
  '4930552P12Rik',
  'Rep15',
  'Ndrg4',
  '2310079G19Rik',
  'Gcg',
  'Shc4',
  'Sorcs2',
  'Bmp6',
  'Lrrc26',
  'Gng4',
  'Kif5a',
  'Sv2a',
  '5330434G04Rik',
  'Resp18',
  'Atp6v0a4',
  'Fam212b',
  'Aldh1a3',
  'Spon2',
  'Cnr1',
  'Cdh5',
  'Nuak1',
  'Hap1',
  'Wnt8b',
  'Scn9a',
  'Rarres2',
  'Ms4a7',
  'Add2',
  'Mfap5',
  'Adm',
  'Vill',
  'Chrdl1',
  'Cadm3',
  'Snap91',
  'Htra3',
  'Enpp2',
  'Syt4',
  'Kcnj3',
  'Cdkn1c',
  'Tnr',
  'Gjb5',
  'C130071C03Rik',
  'Amn',
  'Medag',
  'Kcnq3',
  'Upk1b',
  'Gm12381',
  'Iigp1',
  'Muc16',
  'Itga11',
  'Tspan6',
  'Trpm5',
  'Dnah5',
  'Elavl3',
  'Slc7a14',
  'Esam',
  'Slc

In [64]:
#marker_gene_expression(adata, markers_dict, partition_key='pheno_cell_types')