# Prepare signature data

## About this notebook

This notebook prepares the skin cell atlas dataset for use as the signature in Scaden deconvolution of bulk samples.


### Importing and checking columns

In [1]:
# Loading packages
from pathlib import Path

import anndata as ad
import scanpy as sc
import pandas as pd
import numpy as np

# Read the .h5ad file from disk into an AnnData object
file_path = "data/healthy.h5ad"
adata = sc.read_h5ad(filename=file_path)

In [13]:
# List the per-cell annotation columns stored in adata.obs
adata.obs.keys()

Index(['sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex',
       'full_clustering'],
      dtype='object')

In [14]:
# Number of cells per fine-grained cluster label
adata.obs['full_clustering'].value_counts()

full_clustering
Differentiated_KC      32496
Undifferentiated_KC    21348
Th                     20617
VE2                    15511
moDC                   12490
migLC                  11977
F1                     10161
VE1                     9645
Tc                      8680
Macro_1                 5781
Treg                    5743
Mono_mac                4912
Pericyte_1              4705
F2                      4260
LE2                     3851
Melanocyte              3682
F3                      3035
ILC1_3                  2935
ILC1_NK                 2604
MigDC                   2080
Inf_mac                 1773
Macro_2                 1424
LE1                     1073
NK                       739
DC2                      724
VE3                      629
DC1                      604
LC                       568
Mast_cell                557
ILC2                     494
Pericyte_2               301
Schwann_1                149
Schwann_2                120
Plasma                    71
Name: count, dtype: int64

### Adjusting to more general cell type names

In [15]:
# Create more general cell type definitions.
# The table is written as {general cell type: [fine-grained clusters]} so each
# general name is spelled only once, then inverted into the lookup
# {fine-grained cluster: general cell type}.
_clusters_by_celltype = {
    'Keratinocyte': ['Differentiated_KC', 'Undifferentiated_KC'],
    'CD4 T Cell': ['Th'],
    'CD8 T Cell': ['Tc'],
    'Treg': ['Treg'],
    'Vascular Endothelial': ['VE1', 'VE2', 'VE3'],
    'Lymphatic Endothelial': ['LE1', 'LE2'],
    'Fibroblast': ['F1', 'F2', 'F3'],
    'Pericyte': ['Pericyte_1', 'Pericyte_2'],
    'Melanocyte': ['Melanocyte'],
    'Schwann': ['Schwann_1', 'Schwann_2'],
    'Langerhans': ['LC', 'migLC'],
    'Dendritic Cell': ['DC1', 'DC2', 'MigDC'],
    'moDendritic Cell': ['moDC'],
    'Macrophage': ['Macro_1', 'Macro_2'],
    'Monocyte Macrophage': ['Mono_mac'],
    'Inflammatory Macrophage': ['Inf_mac'],
    'ILC1 ILC3': ['ILC1_3'],
    'ILC1 NK': ['ILC1_NK'],
    'ILC2': ['ILC2'],
    'NK Cell': ['NK'],
    'Mast Cell': ['Mast_cell'],
    'Plasma': ['Plasma'],
}

general_cells = {
    cluster: celltype
    for celltype, clusters in _clusters_by_celltype.items()
    for cluster in clusters
}

In [None]:
# Translate each cell's fine-grained cluster label into its general cell type.
# Direct dictionary lookup is used so that a label missing from general_cells
# raises KeyError.
adata.obs['reduced_celltypes'] = list(
    map(general_cells.__getitem__, adata.obs['full_clustering'])
)

In [18]:
# Number of cells per general cell type after relabelling
adata.obs['reduced_celltypes'].value_counts()

reduced_celltypes
Keratinocyte               53844
Vascular Endothelial       25785
CD4 T Cell                 20617
Fibroblast                 17456
Langerhans                 12545
moDendritic Cell           12490
CD8 T Cell                  8680
Macrophage                  7205
Treg                        5743
Pericyte                    5006
Lymphatic Endothelial       4924
Monocyte Macrophage         4912
Melanocyte                  3682
Dendritic Cell              3408
ILC1 ILC3                   2935
ILC1 NK                     2604
Inflammatory Macrophage     1773
NK Cell                      739
Mast Cell                    557
ILC2                         494
Schwann                      269
Plasma                        71
Name: count, dtype: int64

### Creating function to subset data randomly

In [19]:
def keywiseAnndataUndersample(adata, obs_key, target_n=1000, target_prop=None):

    '''
    Subsamples an AnnData object separately within each value of an obs column.

    Arguments:
        adata =         AnnData object.
        obs_key =       Key in adata.obs that defines the groups. Usually cell types.
        target_n =      Number of observations to keep in each group, or (when
                        target_prop is also given) the group size above which
                        target_prop is applied. Pass None to apply target_prop
                        to every group.
        target_prop =   Proportion of observations to keep in each group. (Optional)

    If only target_n is supplied: groups with more than target_n observations are
    subsampled to target_n observations. Smaller groups are kept unchanged.

    If only target_prop is supplied (target_n=None): every group is subsampled to
    the given proportion, regardless of its size.

    If both are supplied: only groups with more than target_n observations are
    subsampled to the given proportion; target_n acts as a minimum size threshold.

    Returns:
        A new AnnData object made by concatenating the per-group subsets.

    Raises:
        ValueError if both target_n and target_prop are None.
    '''

    # Without this guard the size comparison below fails with an opaque TypeError.
    if target_n is None and target_prop is None:
        raise ValueError("At least one of 'target_n' or 'target_prop' must be provided.")

    # target_prop, when given, decides HOW groups are reduced;
    # target_n, when given, decides WHICH groups are reduced.
    if target_prop is None:
        subsample_kwargs = {'n_obs': target_n}
    else:
        subsample_kwargs = {'fraction': target_prop}

    adata_list = []
    for key in adata.obs[obs_key].unique():
        subset = adata[adata.obs[obs_key] == key]
        if target_n is None or subset.n_obs > target_n:
            # Modifies `subset` in place; sc.pp.subsample is called with its
            # default random_state.
            sc.pp.subsample(subset, **subsample_kwargs)
        adata_list.append(subset)

    # NOTE(review): AnnData.concatenate emits deprecation warnings pointing to
    # anndata.concat. It is kept here because switching would presumably change
    # the resulting obs names / added batch column - confirm before migrating.
    adata_ds = adata_list[0].concatenate(*adata_list[1:])

    return adata_ds.copy()


### Subsetting and checking value counts

In [20]:
# Cap every general cell type at 2000 cells, then check the resulting counts
adata_ds = keywiseAnndataUndersample(
    adata,
    obs_key='reduced_celltypes',
    target_n=2000,
)
adata_ds.obs['reduced_celltypes'].value_counts()

  if not is_categorical_dtype(df_full[k]):

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
  concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique)
  if pd.api.types.is_categorical_dtype(dtype):
  concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique)
  concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique)
  if pd.api.types.is_categorical_dtype(dtype):


reduced_celltypes
Keratinocyte               2000
Macrophage                 2000
Pericyte                   2000
moDendritic Cell           2000
Monocyte Macrophage        2000
Dendritic Cell             2000
ILC1 ILC3                  2000
Melanocyte                 2000
ILC1 NK                    2000
Treg                       2000
CD8 T Cell                 2000
Vascular Endothelial       2000
CD4 T Cell                 2000
Langerhans                 2000
Fibroblast                 2000
Lymphatic Endothelial      2000
Inflammatory Macrophage    1773
NK Cell                     739
Mast Cell                   557
ILC2                        494
Schwann                     269
Plasma                       71
Name: count, dtype: int64

### Removing unclear celltypes

In [23]:
# Cell types considered unclear; their cells are dropped from the subset
cells = ['ILC1 ILC3', 'ILC1 NK', 'ILC2']

is_unclear = adata_ds.obs['reduced_celltypes'].isin(cells)
adata_ds = adata_ds[~is_unclear]

  if not is_categorical_dtype(df_full[k]):


In [61]:
## Preparing files for Scaden in omnideconv (R)

# Make sure the output folder exists, otherwise to_csv fails on a fresh checkout
output_dir = Path("data/signature")
output_dir.mkdir(parents=True, exist_ok=True)

# Converting to raw and saving
if adata_ds.raw is None:
    raise ValueError("adata_ds.raw is not set; the raw matrix is required for the signature export.")

raw_x = adata_ds.raw.X
# raw.X may be a sparse matrix (needs densifying) or already a dense array
raw_matrix = raw_x.toarray() if hasattr(raw_x, "toarray") else np.asarray(raw_x)

# Rows are cells (obs names), columns are the variables of the raw matrix
pd.DataFrame(data = raw_matrix,
             index = adata_ds.obs_names,
             columns = adata_ds.raw.var_names).to_csv(output_dir / "skin_sc_matrix.csv")

# Saving obs data
adata_ds.obs.to_csv(output_dir / "skin_sc_obs.csv")