In [1]:
# Tutorial: https://scar-tutorials.readthedocs.io/en/main/
import numpy as np
import pandas as pd
import scanpy as sc
from scar import model
import os

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides warnings raised when AnnData views are
# modified in later cells — consider narrowing the filter while debugging.
warnings.simplefilter("ignore")

In [2]:
# Import data
#
# `datadir` is the 10x output directory holding both the filtered and the raw
# feature-barcode matrices; `datadir_name` is reused to name the CSV written
# at the end of the notebook. Both must be assigned here, otherwise the
# notebook fails with a NameError on a fresh kernel.
#
# Alternative datasets (uncomment one pair to switch):
# datadir_name = 'data_pbmc5knextgem'
# datadir_name = 'data_pbmc10khealthdonor'
# datadir_name = 'data_malt10k'
# datadir = '/rprojectnb2/camplab/home/yin/poisson/' + datadir_name

# NOTE(review): 'young_aged' was the last selection left in this cell —
# confirm it is the dataset the stored outputs were produced from.
datadir_name = 'young_aged'
datadir = '/rprojectnb2/camplab/home/yin/decontX_ADT/zhangetal/young-aged'

if not os.path.isdir(datadir):
    raise FileNotFoundError('Data directory not found: ' + datadir)

# Filtered count matrix. gex_only=False keeps the non-gene-expression
# features, which the later cells split off by `feature_types`.
adata_filtered = sc.read_10x_mtx(
    os.path.join(datadir, 'filtered_feature_bc_matrix'),
    var_names='gene_symbols',
    gex_only=False)

# Import raw count matrix (all barcodes; used later to pick low-count
# droplets for the ambient profile)
adata_raw = sc.read_10x_mtx(
    os.path.join(datadir, 'raw_feature_bc_matrix'),
    var_names='gene_symbols',
    gex_only=False)


In [3]:
# Keep an untouched copy of the count matrix of each object in a 'counts' layer.
for _adata in (adata_filtered, adata_raw):
    _adata.layers['counts'] = _adata.X.copy()

In [4]:
# Split data
# Each object is divided column-wise into gene-expression features and
# everything else (treated as the protein modality).
is_gex_filtered = adata_filtered.var['feature_types'] == 'Gene Expression'
is_gex_raw = adata_raw.var['feature_types'] == 'Gene Expression'

protein_filtered = adata_filtered[:, ~is_gex_filtered]
print(protein_filtered)
rna_filtered = adata_filtered[:, is_gex_filtered]
print(rna_filtered)

protein_raw = adata_raw[:, ~is_gex_raw]
print(protein_raw)
rna_raw = adata_raw[:, is_gex_raw]
print(rna_raw)

View of AnnData object with n_obs × n_vars = 3423 × 141
    var: 'gene_ids', 'feature_types'
    layers: 'counts'
View of AnnData object with n_obs × n_vars = 3423 × 36601
    var: 'gene_ids', 'feature_types'
    layers: 'counts'
View of AnnData object with n_obs × n_vars = 724721 × 141
    var: 'gene_ids', 'feature_types'
    layers: 'counts'
View of AnnData object with n_obs × n_vars = 724721 × 36601
    var: 'gene_ids', 'feature_types'
    layers: 'counts'


In [5]:
# Remove HTO tag
#
# Every column mask below is computed from protein_filtered and then applied
# to protein_raw as well. That is only valid when both objects list the same
# features in the same order, so the assumption is checked up front.
assert protein_filtered.var_names.equals(protein_raw.var_names), \
    'filtered and raw protein matrices must share the same feature order'

# 1) Drop features whose name contains 'HTO'
hto_f = np.array(['HTO' in name for name in protein_filtered.var_names], dtype=bool)
protein_filtered = protein_filtered[:, ~hto_f]
protein_raw = protein_raw[:, ~hto_f]

print(protein_filtered)

# 2) Drop features whose name contains 'ADT'
adt_name_f = np.array(['ADT' in name for name in protein_filtered.var_names], dtype=bool)
protein_filtered = protein_filtered[:, ~adt_name_f]
protein_raw = protein_raw[:, ~adt_name_f]

print(protein_filtered)

# 3) Keep features detected (count > 0) in more than 50 filtered barcodes.
# Summing a sparse matrix along axis 0 yields a 1 x n matrix; flatten it so
# the column index is a plain 1-D boolean array.
n_detected = np.asarray((protein_filtered.X > 0).sum(axis=0)).ravel()
adt_f = n_detected > 50
protein_filtered = protein_filtered[:, adt_f]
protein_raw = protein_raw[:, adt_f]

View of AnnData object with n_obs × n_vars = 3423 × 141
    var: 'gene_ids', 'feature_types'
    layers: 'counts'
View of AnnData object with n_obs × n_vars = 3423 × 131
    var: 'gene_ids', 'feature_types'
    layers: 'counts'


In [6]:
# Show the filtered protein object after the feature-level filters above.
print(protein_filtered)

View of AnnData object with n_obs × n_vars = 3423 × 130
    var: 'gene_ids', 'feature_types'
    layers: 'counts'


In [7]:
# QC raw
# Compute QC metrics for both raw modalities (adds 'total_counts' to .obs),
# then keep only the barcodes with at least one count in BOTH modalities.
for _modality, _var_type in ((rna_raw, 'genes'), (protein_raw, 'adt')):
    sc.pp.calculate_qc_metrics(_modality,
                               var_type=_var_type,
                               percent_top=None,
                               inplace=True)

has_rna_counts = (rna_raw.obs['total_counts'] > 0).to_numpy()
has_adt_counts = (protein_raw.obs['total_counts'] > 0).to_numpy()
flt = np.logical_and(has_rna_counts, has_adt_counts)

protein_raw = protein_raw[flt, :]
rna_raw = rna_raw[flt, :]


In [8]:
# Display the raw protein object after dropping barcodes with zero counts.
protein_raw

View of AnnData object with n_obs × n_vars = 180230 × 130
    obs: 'n_adt_by_counts', 'log1p_n_adt_by_counts', 'total_counts', 'log1p_total_counts'
    var: 'gene_ids', 'feature_types', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    layers: 'counts'

In [9]:
# A raw barcode is flagged when it has fewer than 100 RNA counts AND fewer
# than 25 protein counts; the last line reports how many were flagged.
low_rna = (rna_raw.obs['total_counts'] < 100).to_numpy()
low_adt = (protein_raw.obs['total_counts'] < 25).to_numpy()
empty_flt = low_rna & low_adt
empty_flt.sum()

131021

In [10]:
# Select the barcodes flagged by `empty_flt` (the count-threshold mask built
# in the previous cell) from the raw protein matrix.

# Unused alternative: take every raw barcode that is absent from the filtered matrix.
# empty_filter = ~protein_raw.obs_names.isin(protein_filtered.obs_names)
protein_empty = protein_raw[empty_flt,:]

print(protein_empty)

View of AnnData object with n_obs × n_vars = 131021 × 130
    obs: 'n_adt_by_counts', 'log1p_n_adt_by_counts', 'total_counts', 'log1p_total_counts'
    var: 'gene_ids', 'feature_types', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    layers: 'counts'


In [11]:
# QC filtered

# Flag mitochondrial genes by name prefix: try the upper-case "MT-" prefix
# first and fall back to lower-case "mt-" when nothing matches.
rna_filtered.var['mito'] = rna_filtered.var_names.str.startswith("MT-")
n_mito = sum(rna_filtered.var['mito'])
print(f'Num of MT genes: {n_mito}')

if n_mito == 0:
    rna_filtered.var['mito'] = rna_filtered.var_names.str.startswith("mt-")
    n_mito = sum(rna_filtered.var['mito'])
    print(f'Num of mt genes: {n_mito}')

# Shared QC settings; the RNA call additionally summarises the 'mito' flag.
shared_qc_kwargs = dict(percent_top=None, inplace=True)

sc.pp.calculate_qc_metrics(rna_filtered,
                           qc_vars=["mito"],
                           var_type='genes',
                           **shared_qc_kwargs)

sc.pp.calculate_qc_metrics(protein_filtered,
                           var_type='adt',
                           **shared_qc_kwargs)


Num of MT genes: 13


In [12]:
# Remove high mitochondrial percentage cells
# Remove 0 count cells
# Remove outliers (outside the 1%-99% quantile range of total counts, per modality)

max_pct_mito = 14    # upper bound (exclusive) on percent mitochondrial counts
lower_q, upper_q = 0.01, 0.99    # quantile bounds (exclusive) on total counts

# The mask is built from both modalities, so they must describe the same
# barcodes in the same order.
assert rna_filtered.obs_names.equals(protein_filtered.obs_names), \
    'rna_filtered and protein_filtered must contain the same barcodes'

rna_total = rna_filtered.obs['total_counts']
adt_total = protein_filtered.obs['total_counts']

flt = np.logical_and.reduce((rna_filtered.obs['pct_counts_mito'] < max_pct_mito,
                             rna_total > 0,
                             adt_total > 0,
                             rna_total > np.quantile(rna_total, lower_q),
                             rna_total < np.quantile(rna_total, upper_q),
                             adt_total > np.quantile(adt_total, lower_q),
                             adt_total < np.quantile(adt_total, upper_q)))

print('Num of cells after filtering: ' + str(sum(flt)))

# Subset BOTH modalities so they stay aligned; filtering only one leaves the
# objects with different barcode sets and makes this cell fail when re-run.
protein_filtered = protein_filtered[flt,:]
rna_filtered = rna_filtered[flt,:]


Num of cells after filtering: 3223


In [13]:
# Inspect the per-feature annotations (.var) of the filtered protein object.
# No filtering happens in this cell.

print(protein_filtered.var)

                gene_ids     feature_types  n_cells_by_counts  mean_counts  \
CD3_TotalSeqC        CD3  Antibody Capture               3263    17.103127   
CD19_TotalSeqC      CD19  Antibody Capture               2397     7.016360   
CD4_TotalSeqC        CD4  Antibody Capture               3258    22.333042   
CD14_TotalSeqC      CD14  Antibody Capture               1744     0.871166   
CD16_TotalSeqC      CD16  Antibody Capture               1830     1.652936   
...                  ...               ...                ...          ...   
HLA-E_TotalSeqC    HLA-E  Antibody Capture               3390     7.777680   
CD82_TotalSeqC      CD82  Antibody Capture               3411    15.306457   
CD101_TotalSeqC    CD101  Antibody Capture               3234     6.860649   
C5AR1_TotalSeqC    C5AR1  Antibody Capture               3164     7.378031   
GGT1_TotalSeqC      GGT1  Antibody Capture               3345    11.355828   

                 log1p_mean_counts  pct_dropout_by_counts  tota

In [14]:
# Ambient profile: each feature's share of the total counts observed in the
# selected low-count barcodes (protein_empty).
empty_counts = protein_empty.X
per_feature_total = empty_counts.sum(axis=0)
grand_total = empty_counts.sum()

ambient_profile = pd.DataFrame(
    data=(per_feature_total / grand_total).A1,
    columns=['ambient profile'],
    index=protein_empty.var_names,
)
ambient_profile.head()

Unnamed: 0,ambient profile
CD3_TotalSeqC,0.007841
CD19_TotalSeqC,0.004462
CD4_TotalSeqC,0.011405
CD14_TotalSeqC,0.003086
CD16_TotalSeqC,0.003058


In [15]:
# Cell-by-feature count table, with the '_TotalSeqC' suffix stripped from the
# column names.
filtered_counts = protein_filtered.to_df()
filtered_counts.columns = [
    column.replace('_TotalSeqC', '') for column in filtered_counts.columns
]

filtered_counts.head()

Unnamed: 0,CD3,CD19,CD4,CD14,CD16,CD56,CD25,PD-1,TIGIT,IgG1_control,...,IGKC,LILRB1,FCER2,SIGLEC7,ADGRG1,HLA-E,CD82,CD101,C5AR1,GGT1
AAACCTGAGCAATCTC-1,22.0,2.0,19.0,1.0,1.0,0.0,4.0,11.0,2.0,3.0,...,1.0,2.0,4.0,0.0,3.0,8.0,11.0,9.0,4.0,6.0
AAACCTGCAAAGGCGT-1,8.0,1.0,8.0,0.0,2.0,0.0,6.0,9.0,1.0,2.0,...,3.0,1.0,5.0,0.0,0.0,5.0,9.0,1.0,0.0,3.0
AAACCTGGTAGCAAAT-1,51.0,0.0,30.0,0.0,1.0,0.0,2.0,7.0,5.0,3.0,...,4.0,2.0,0.0,0.0,1.0,1.0,6.0,3.0,3.0,4.0
AAACCTGGTGATGCCC-1,22.0,1.0,0.0,0.0,0.0,0.0,2.0,10.0,5.0,0.0,...,2.0,1.0,3.0,0.0,0.0,3.0,1.0,3.0,2.0,8.0
AAACCTGTCGACAGCC-1,17.0,0.0,2.0,2.0,1.0,0.0,1.0,15.0,8.0,1.0,...,3.0,2.0,2.0,0.0,0.0,5.0,9.0,1.0,2.0,7.0


In [None]:
# Build the scAR model from the filtered protein counts and the ambient
# profile estimated above, train it, then run inference.
# NOTE(review): no random seed is set in this notebook, so results may differ
# between runs — confirm whether reproducibility is required.
ADT_scar = model(raw_count = filtered_counts,
                 ambient_profile = ambient_profile,  # Providing ambient profile is recommended for CITEseq; in other modes, you can leave this argument as the default value -- None
                 feature_type = 'ADT', # "ADT" or "ADTs" for denoising protein counts in CITE-seq
                 count_model = 'binomial'   # Depending on your data's sparsity, choose 'binomial', 'poisson' or 'zeroinflatedpoisson' (verify the exact option spellings against the installed scar version)
                )

# 80 epochs with mini-batches of 64 cells; verbose=True prints the parameter
# set and a progress bar.
ADT_scar.train(epochs=80,
               batch_size=64,
               verbose=True
              )

# After training, we can infer the true protein signal
# (the next cell reads the result from ADT_scar.native_counts)
ADT_scar.inference()

..Running VAE using the following param set:
......denoised count type:  ADT
......count model:  binomial
......num_input_feature:  130
......NN_layer1:  150
......NN_layer2:  100
......latent_space:  15
......dropout_prob:  0
......kld_weight:  1e-05
......lr:  0.001
......lr_step_size:  5
......lr_gamma:  0.97
  Training.....
 35%|███▌      | 28/80 [00:21<00:40,  1.29it/s]

In [None]:
# Wrap the denoised counts in a DataFrame labelled like the input table
# (same barcodes as rows, same feature names as columns).
denoised_ADT = pd.DataFrame(
    data=ADT_scar.native_counts,
    columns=filtered_counts.columns,
    index=filtered_counts.index,
)

# Quick sanity check: container type, then shape, then the first rows.
for _summary in (type(denoised_ADT), denoised_ADT.shape):
    print(_summary)
denoised_ADT.head()


In [None]:
# Write the denoised counts to 'scAR_denoised_<datadir_name>.csv' in the
# current working directory.
# NOTE(review): `datadir_name` must be assigned in the data-import cell before
# this runs; otherwise this line raises a NameError.
denoised_ADT.to_csv('scAR_denoised_'+datadir_name+'.csv')