# Aggregate single cell data to run `cell2cell`

four sets single cell data:
Pellin et.al = human haematopoietic stem cells, FAC sorted CD34 or LIN
Wolock = mouse stroma cell
Tikhonova = mouse stroma cell, FAC sorted for special population

## Steps
1. for mouse data set, converting gene symbol to human gene symbol is needed
- not exactly one-to-one relationship
- lost information

2. We've showed it's `sc.combat()` function cannot mitigate batch effect for two sets of stroma data
- need to try something more sophisticated

3. aggregate cells using louvian cluster, take mean

4. generate metadata file

In [1]:
# settings
import numpy as np
import pandas as pd
import scanpy as sc

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()

sc.settings.set_figure_params(dpi=80)

scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.1 pandas==0.25.2 scikit-learn==0.21.3 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [2]:
from scSecretome.annot import *
from scSecretome.sc import *
import anndata

scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.1 pandas==0.25.2 scikit-learn==0.21.3 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [12]:
adata = sc.read('/home/hsher/processed_single_cell_data/pellin_bbknn.h5ad')
adata.obs.index

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


Index(['AAGTGAGA-TGTAAAGG-0', 'TCCGCTGT-GGAAACAG-0', 'AATATGAC-AGCGTAGG-0',
       'CGAAGAAG-ATTGGGCC-0', 'ACACCAAG-CCCGACTT-0', 'CGACATTT-GTACCTTG-0',
       'AGGACACA-TGACGGAC-0', 'TCTTTGAC-AAGCTTCT-0', 'GTGGTGCT-TCCCGTAG-0',
       'ACGGAGCA-ACACTAAG-0',
       ...
       'GAATCCCA-CCATTACT-10', 'AACGCCAA-ATGGATTA-10', 'GAGAAACC-TTTGGGAG-10',
       'CAAGGAAT-CTATAGAG-10', 'ATACACCC-AGTGAAAG-10', 'GCTATGGG-TCCAGTCC-10',
       'GCCTTGTG-TCTCAACC-10', 'AGGCCGAA-GGGAACGA-10', 'GCCTTGTG-AGCGAAGT-10',
       'GCCTTGTG-AGAACGGG-10'],
      dtype='object', name='index', length=21397)

In [29]:
np.mean(adata.raw[['AAGTGAGA-TGTAAAGG-0', 'TCCGCTGT-GGAAACAG-0']].X, axis = 0)

array([0.        , 0.        , 0.        , ..., 0.34649575, 0.        ,
       0.        ], dtype=float32)

In [31]:
def exp_aggregate(filename, use_raw = False, aggr_group = 'louvain'):
    '''
    use_raw when integrated dataset
    '''
    # read scanpy from processed single cell data
    base = '/home/hsher/processed_single_cell_data/'
    adata = sc.read(base + filename)
    
    # why there are duplicated barcode? becuase the cells are processed in parallel and then concatenated!
    adata.obs_names_make_unique(join='-')
    
    # group expression value by louvain cluster
    if use_raw:
        grouped_expression = pd.DataFrame(columns = adata.raw.var.index, index = adata.obs[aggr_group].unique())
    else:   
        grouped_expression = pd.DataFrame(columns = adata.var.index, index = adata.obs[aggr_group].unique())
    for g in adata.obs.groupby(by = [aggr_group]):
        if use_raw:
            grouped_expression.loc[g[0]] = np.mean(adata.raw[g[1].index.tolist()].X, axis = 0)
        else:

            
            grouped_expression.loc[g[0]] = np.mean(adata[g[1].index, :].X, axis = 0)
    return(grouped_expression, adata.obs)

In [4]:
def metadata(d, obs, species= 'human', aggr_group = 'louvain'):
    '''
    create louvian cluster annotation by using original publication's annotation or by lineage specific genes
    aggr_group: can be other clustering result in adata.obs
    '''
    # save metadata here
    mdata = pd.Series(index = d.index.tolist())
    
    # if cluster information, use cluster
    if 'cluster' in obs.columns:
        # count how many times each annotation occurs
        annot_count = obs.groupby(by = aggr_group)['cluster'].value_counts().unstack()
        annot_rank = annot_count.rank(axis = 1, ascending = False)
        
        
        
        # find the max two values (observe transitioning cells sometimes)
        for louvain_cluster in annot_rank.index:
            if 'nan' in annot_rank.columns:
                if annot_rank.loc[louvain_cluster, 'nan'] == 1:
                    continue
                    # if most are un-annotated, leave it that way
            else:
                # select the top two popular annotation
                best_annot = annot_rank.loc[louvain_cluster].loc[annot_rank.loc[louvain_cluster] == 1].index
                sec_annot = annot_rank.loc[louvain_cluster].loc[annot_rank.loc[louvain_cluster] == 2].index
                ratio = annot_count.loc[louvain_cluster, best_annot].values/annot_count.loc[louvain_cluster, sec_annot].values
                
                
                if len(sec_annot.tolist())  > 0:
                    name = '-'.join([best_annot.tolist()[0], sec_annot.tolist()[0], str(ratio)])
                else:
                    name = best_annot.tolist()[0]
                
                
                mdata.loc[louvain_cluster] = name
    
    # TRYING TO CALL LINEAGE FROM EXPRESSION DATA IS A PIECE OF SHIT
    #aggr = anndata.AnnData(d)
        
    # calling various annotation
    #lineage_calling(aggr, species = species)
        
    # it seems only those with high dispersion carry real biological signal
    #dispersion = aggr.obs.std()/aggr.obs.mean()
        
    # select those columns with dispersion > 0.99
    #selected = aggr.obs[dispersion.loc[dispersion > 0.99].index]
        
    # min-max scale the data
    #from sklearn import preprocessing
    #min_max_scaler = preprocessing.MinMaxScaler()
    #scaled_array = min_max_scaler.fit_transform(selected)
    #rk = pd.DataFrame(scaled_array, index = selected.index, columns = selected.columns).idxmax(axis = 1)
        
    # concat both annot
    #df = pd.concat([mdata, rk], axis = 1)
    #df.columns = ['cluster', 'marker_gene_assigned']
    return(mdata.to_frame())

In [5]:
def mice_to_human(d):
    '''
    take single cell matrix, convert mouse gene symbol to human
    '''
    id_converter = human_mouse_homolog()
    
    # drop non-unique mouse gene symbols
    id_converter.drop_duplicates(subset = ['Mouse'], inplace = True)
    id_converter.set_index('Mouse', inplace = True)
    
    # map gene names
    human_symbols = d.columns.to_series().map(id_converter['Human'], na_action = 'ignore') # don't propagate NaN
    human_symbols.dropna(inplace = True)
    
    #return(human_symbols)
    # subsetting gene names
    new_d = d[human_symbols.index]
    new_d.columns  = human_symbols
    return(new_d)

In [9]:
def wrapper(filename, species = 'human', output_fname = 'unspecified', aggr_group = 'louvain', use_raw = False):
    d, obs = exp_aggregate(filename, aggr_group = aggr_group, use_raw = use_raw)
    mdata = metadata(d, obs, species = species, aggr_group = aggr_group)
    
    if species == 'mouse':
        d = mice_to_human(d)
    
    # transpose to fit cell2cell input
    d = d.transpose()
    d.index.name = 'Gene Symbol'
    
    # save to folder
    base = '/home/hsher/c2c_input/'
    if output_fname == 'unspecified':
        fname = filename.split('.')[0]
    else:
        fname = output_fname
    d.to_excel(base + fname + '_matrix.xlsx')
    mdata.to_excel(base + fname + '_metadata.xlsx')

In [7]:
wrapper('wolock.h5ad', species = 'mouse')
wrapper('tikhonova.h5ad', species = 'mouse')
wrapper('pellin_cd34.h5ad', species = 'human')
wrapper('pellin_lin.h5ad', species = 'human')

IndexError: list index out of range

In [None]:
wrapper('pellin_cd34_sec.h5ad', species = 'human')
wrapper('pellin_lin_sec.h5ad', species = 'human')

In [32]:
wrapper('pellin_bbknn.h5ad', species = 'human', use_raw = True)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [33]:
wrapper('pellin_bbknn.h5ad', species = 'human', aggr_group = 'leiden',  output_fname = 'pellin_bbknn_leiden', use_raw = True)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
