# Aggregate single cell data to run `cell2cell`

Four sets of single-cell data:
Pellin et al. = human haematopoietic stem cells, FACS-sorted for CD34 or LIN (two data sets)
Wolock = mouse stroma cells
Tikhonova = mouse stroma cells, FACS-sorted for specific populations

## Steps
1. For the mouse data sets, the gene symbols need to be converted to human gene symbols
- the mapping is not exactly one-to-one
- some information is lost in the conversion

2. We have shown that the `sc.combat()` function cannot mitigate the batch effect between the two sets of stroma data
- we need to try something more sophisticated

3. Aggregate cells by Louvain cluster, taking the mean expression of each cluster

4. generate metadata file

In [1]:
# settings
import sys
sys.path.append('/home/hsher/scSecretome')
import numpy as np
import pandas as pd
import scanpy as sc
from scSecretome.annot import *
from scSecretome.sc import *
import anndata

# Use the most verbose scanpy logging level so that hints are shown as well.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its dependencies, as a record of the environment.
sc.logging.print_versions()

# Draw figures at 80 dpi.
sc.settings.set_figure_params(dpi=80)

scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.1 pandas==0.25.2 scikit-learn==0.21.3 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1
scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.1 pandas==0.25.2 scikit-learn==0.21.3 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [2]:
def exp_aggregate(filename, use_raw = False, aggr_group = 'louvain',
                  base = '/home/hsher/processed_single_cell_data/'):
    '''
    Average the expression of one processed single cell data set per cell group.

    Parameters
    ----------
    filename : str
        Name of the processed data file; it is appended to `base` and read
        with `sc.read`.
    use_raw : bool
        If True, average `adata.raw` instead of `adata.X`
        (use this for an integrated dataset).
    aggr_group : str
        Name of the `adata.obs` column that defines the groups
        (default: the louvain cluster).
    base : str
        Directory prefix (with trailing slash) that holds the processed files.

    Returns
    -------
    (grouped_expression, obs) : tuple
        `grouped_expression` is a float DataFrame with one row per group
        (in order of first appearance in `adata.obs[aggr_group]`) and one
        column per gene, holding the mean expression of the group.
        `obs` is `adata.obs` of the loaded data.
    '''
    # read scanpy from processed single cell data
    adata = sc.read(base + filename)

    # Barcodes can be duplicated because the cells were processed in parallel
    # and then concatenated; the name-based selection below needs unique names.
    adata.obs_names_make_unique(join='-')

    genes = adata.raw.var.index if use_raw else adata.var.index
    grouped_expression = pd.DataFrame(columns = genes,
                                      index = adata.obs[aggr_group].unique(),
                                      dtype = float)

    # Group by the column name itself rather than a one-element list: newer
    # pandas yields 1-tuples as keys for a list-like `by`, which would not
    # match the row labels. observed=True skips categories without any cell.
    for label, cells in adata.obs.groupby(aggr_group, observed = True):
        names = cells.index.tolist()
        X = adata.raw[names].X if use_raw else adata[names, :].X
        # NOTE(review): older anndata releases appear to return a 1-D X for a
        # single-cell slice; restore the (cells, genes) shape so the mean is
        # still taken over cells -- confirm against the installed version.
        if X.ndim == 1:
            X = X.reshape(-1, len(genes))
        # A sparse X gives a 1 x n_genes matrix here; flatten it to one row.
        grouped_expression.loc[label] = np.asarray(X.mean(axis = 0)).ravel()
    return(grouped_expression, adata.obs)

In [3]:
def wrapper(filename, output_fname = 'unspecified', aggr_group = 'louvain', use_raw = False):
    '''
    Aggregate one data set with `exp_aggregate` and save it as a cell2cell
    input matrix (genes in rows, cell groups in columns).

    Parameters
    ----------
    filename : str
        Name of the processed data file, passed on to `exp_aggregate`.
    output_fname : str
        Stem of the output file. The default 'unspecified' means: use
        `filename` without its extension.
    aggr_group : str
        Name of the `adata.obs` column used for grouping.
    use_raw : bool
        Passed on to `exp_aggregate`.

    The matrix is written to '/home/hsher/c2c_input/<stem>_matrix.xlsx';
    nothing is returned.
    '''
    # the per-cell annotation returned alongside the matrix is not needed here
    d, _ = exp_aggregate(filename, aggr_group = aggr_group, use_raw = use_raw)

    # transpose to fit cell2cell input
    d = d.transpose()
    d.index.name = 'Gene Symbol'

    # save to folder
    base = '/home/hsher/c2c_input/'
    if output_fname == 'unspecified':
        # strip only the last extension, so that names such as 'a.v1.h5ad'
        # and 'a.v2.h5ad' do not end up in the same output file
        fname = filename.rsplit('.', 1)[0]
    else:
        fname = output_fname
    d.to_excel(base + fname + '_matrix.xlsx')
    

In [4]:
# Write the cell2cell input matrix for each processed data set, in this order.
for processed_file in ('wolock_mapped.h5ad',
                       'tikhonova_mapped.h5ad',
                       'pellin_cd34.h5ad',
                       'pellin_lin.h5ad'):
    wrapper(processed_file)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
