## Preprocessing 10X PBMC ATAC + RNA

This notebook follows the muon tutorial at https://github.com/PMBio/muon-tutorials/tree/master/single-cell-rna-atac/pbmc10k, but uses the data from the MOFA2 tutorial at https://raw.githack.com/bioFAM/MOFA2_tutorials/master/R_tutorials/10x_scRNA_scATAC.html

## Imports

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import muon as mu

In [2]:
# Reading R files
# rpy2 gives access to R's readRDS, which is used below to load the .RDS
# files (dimnames and cell metadata).
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Enable rpy2's pandas conversion globally for the rest of the session.
# NOTE(review): newer rpy2 releases appear to deprecate activate() in favor of
# a local converter context — confirm against the installed version.
pandas2ri.activate()
# Python handle to the R function `readRDS`; call it as readRDS(path).
readRDS = robjects.r['readRDS']

## Load RNA

In [3]:
# Load the RNA count matrix from its .mtx file into an AnnData object.
rna_mtx_path = '../datasets/PBMC_10k/original/RNA_counts.mtx'
rna = sc.read_mtx(rna_mtx_path)
rna  # show the AnnData summary as the cell output

AnnData object with n_obs × n_vars = 29732 × 11909

In [4]:
# Swap the obs and var axes of the loaded matrix (equivalent to `rna.T`).
rna = rna.transpose()

In [5]:
# Label both axes of `rna` from the R dimnames object:
# element 0 names the variables, element 1 names the observations.
dimnames = readRDS('../datasets/PBMC_10k/original/RNA_dimnames.RDS')
var_labels = np.array(dimnames[0])
rna.var_names = var_labels
obs_labels = np.array(dimnames[1])
rna.obs_names = obs_labels

## Load ATAC

In [None]:
# Load the ATAC count matrix from its .mtx file into an AnnData object.
atac_mtx_path = '../datasets/PBMC_10k/original/ATAC_counts.mtx'
atac = sc.read_mtx(atac_mtx_path)
atac  # show the AnnData summary as the cell output

In [None]:
# Swap the obs and var axes of the loaded matrix (equivalent to `atac.T`).
atac = atac.transpose()

In [None]:
# Label both axes of `atac` from the R dimnames object:
# element 0 names the variables, element 1 names the observations.
dimnames = readRDS('../datasets/PBMC_10k/original/ATAC_dimnames.RDS')
var_labels = np.array(dimnames[0])
atac.var_names = var_labels
obs_labels = np.array(dimnames[1])
atac.obs_names = obs_labels

## Add cell types

In [None]:
# Read cell type annotations
meta = readRDS('../datasets/PBMC_10k/original/meta.RDS')
celltype = meta['celltype']
# Align the annotation to each modality's cells with reindex rather than .loc:
# a barcode missing from `meta` becomes NaN instead of raising KeyError (which
# .loc does in current pandas). Unlabeled cells are dropped later, right
# before the modalities are packaged together.
rna.obs['celltype'] = celltype.reindex(rna.obs.index)
atac.obs['celltype'] = celltype.reindex(atac.obs.index)

## Preprocess RNA

In [None]:
# QC metrics, including the share of counts coming from mitochondrial genes.
# Genes whose name starts with 'MT-' are flagged in the boolean var column 'mt'.
is_mito = rna.var_names.str.startswith('MT-')
rna.var['mt'] = is_mito
sc.pp.calculate_qc_metrics(
    rna,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Violin plots of the RNA QC metrics (before the filtering cells below).
rna_qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(rna, rna_qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# Keep only genes whose 'n_cells_by_counts' value is at least 10.
min_cells = 10
mu.pp.filter_var(rna, 'n_cells_by_counts', lambda n_cells: n_cells >= min_cells)

In [None]:
# Keep cells inside the QC bounds, applying one filter after another:
#   500 <= n_genes_by_counts < 4,500; total_counts < 13,000; pct_counts_mt < 20.
rna_cell_filters = (
    ('n_genes_by_counts', lambda n: (n >= 500) & (n < 4_500)),
    ('total_counts', lambda n: n < 13_000),
    ('pct_counts_mt', lambda pct: pct < 20),
)
for obs_key, keep in rna_cell_filters:
    mu.pp.filter_obs(rna, obs_key, keep)

In [None]:
# Same QC violin plots again, now on the filtered RNA data.
rna_qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(rna, rna_qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# Scale each cell so that its counts sum to the target below.
rna_target_sum = 1e4
sc.pp.normalize_total(rna, target_sum=rna_target_sum)

In [None]:
# Log-normalize
# Applies scanpy's log1p transform to `rna`; the call's return value is not
# captured, so the transformed data is read from `rna` itself afterwards.
sc.pp.log1p(rna)

In [None]:
# Flag highly variable genes using mean / dispersion cutoffs, then plot them.
rna_hvg_cutoffs = {'min_mean': 0.1, 'max_mean': 4, 'min_disp': 0.5}
sc.pp.highly_variable_genes(rna, **rna_hvg_cutoffs)
sc.pl.highly_variable_genes(rna)

In [None]:
# Number of genes flagged as highly variable (shown as the cell output).
np.sum(rna.var['highly_variable'])

## Preprocess ATAC

In [None]:
# Attach the peak annotation table to `atac`, then display what was stored
# under atac.uns['atac']['peak_annotation'].
peak_annotation_path = "../datasets/PBMC_10k/original/atac_peak_annotation.tsv"
mu.atac.tl.add_peak_annotation(atac, annotation=peak_annotation_path)
atac.uns['atac']['peak_annotation']

In [None]:
# Compute QC metrics for the ATAC modality and plot their distributions.
sc.pp.calculate_qc_metrics(
    atac,
    percent_top=None,
    log1p=False,
    inplace=True,
)
atac_qc_keys = ['total_counts', 'n_genes_by_counts']
sc.pl.violin(atac, atac_qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# Keep only ATAC features whose 'n_cells_by_counts' value is at least 10.
min_cells = 10
mu.pp.filter_var(atac, 'n_cells_by_counts', lambda n_cells: n_cells >= min_cells)

In [None]:
# Keep cells inside the QC bounds, applying one filter after another:
#   1,000 <= n_genes_by_counts <= 15,000; 1,000 <= total_counts <= 45,000.
atac_cell_filters = (
    ('n_genes_by_counts', lambda n: (n >= 1_000) & (n <= 15_000)),
    ('total_counts', lambda n: (n >= 1_000) & (n <= 45_000)),
)
for obs_key, keep in atac_cell_filters:
    mu.pp.filter_obs(atac, obs_key, keep)

In [None]:
# QC violin plots again, now on the filtered ATAC data.
atac_qc_keys = ['n_genes_by_counts', 'total_counts']
sc.pl.violin(atac, atac_qc_keys, jitter=0.4, multi_panel=True)

In [None]:
# Per-cell normalization: scale every cell so that its counts sum to 1.
# Uses sc.pp.normalize_total (the function already used for the RNA modality)
# instead of the deprecated sc.pp.normalize_per_cell; the scaled matrix is the
# same. NOTE: unlike normalize_per_cell, this neither adds an obs['n_counts']
# column nor drops zero-count cells (cells were already filtered on
# total_counts in the filtering cell before this one).
sc.pp.normalize_total(atac, target_sum=1)
# The log-transform is left disabled for ATAC, as in the original notebook:
#sc.pp.log1p(atac)

In [None]:
# Flag highly variable ATAC features using mean / dispersion cutoffs, then
# plot them. The mean cutoffs are small because each cell was scaled to sum to 1.
atac_hvg_cutoffs = {'min_mean': 4e-5, 'max_mean': 4e-4, 'min_disp': 0.5}
sc.pp.highly_variable_genes(atac, **atac_hvg_cutoffs)
sc.pl.highly_variable_genes(atac)

In [None]:
# Number of ATAC features flagged as highly variable (shown as the cell output).
np.sum(atac.var['highly_variable'])

## Package into Muon object

In [None]:
# Drop cells that have no cell type label, in each modality separately.
# .copy() materializes the subset as an independent AnnData object.
rna_labeled = ~rna.obs['celltype'].isna()
rna = rna[rna_labeled].copy()
atac_labeled = ~atac.obs['celltype'].isna()
atac = atac[atac_labeled].copy()

In [None]:
# Bundle both modalities into one MuData container, keyed 'rna' and 'atac'.
modalities = {'rna': rna, 'atac': atac}
mdata = mu.MuData(modalities)

In [None]:
# Intersect the observations of the modalities held in `mdata`. The return
# value is not captured, so the result is read from `mdata` itself afterwards.
mu.pp.intersect_obs(mdata)

In [None]:
# Display the MuData summary as the cell output.
mdata

In [40]:
# Save the preprocessed MuData object, passing compression='gzip' to the writer.
# NOTE(review): the '.gz' suffix presumably reflects that compression option
# rather than a gzip-wrapped file — confirm before decompressing with gunzip.
h5mu_path = '../datasets/PBMC_10k/preprocessed/pbmc10k.h5mu.gz'
mdata.write_h5mu(h5mu_path, compression='gzip')

## Visualize

In [None]:
# RNA embedding: neighbor graph, then UMAP (both with default settings),
# then a plot colored by cell type with labels drawn on the data.
for step in (sc.pp.neighbors, sc.tl.umap):
    step(rna)
umap_plot_opts = {"color": "celltype", "legend_loc": "on data"}
sc.pl.umap(rna, **umap_plot_opts)

In [None]:
# ATAC embedding: neighbor graph, then UMAP (both with default settings),
# then a plot colored by cell type with labels drawn on the data.
for step in (sc.pp.neighbors, sc.tl.umap):
    step(atac)
umap_plot_opts = {"color": "celltype", "legend_loc": "on data"}
sc.pl.umap(atac, **umap_plot_opts)